L'objectif du projet est de segmenter les clients du site e-commerce, ce qui inclut :
Dans le premier notebook du projet, on a consolidé les informations sur les commandes et les clients dans deux datasets. On a aussi fait un nettoyage et une exploration des données pour se préparer à la modélisation. Dans ce notebook, on teste plusieurs modèles avec différentes features et différents nombres de clusters.
#imports: regular expressions, operating system, randomness, date/time
#import re,os,math
import re, os, random, datetime
#from math import sin, cos, sqrt, atan2, radians, degrees
#from cmath import rect, phase
from ast import literal_eval
#data modules
import numpy as np
import pandas as pd
import scipy as sp
#graphic modules
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.colors import ListedColormap
import seaborn as sns
#sklearn functions
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, FactorAnalysis, KernelPCA
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch, DBSCAN
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding, TSNE
from sklearn.mixture import GaussianMixture
from sklearn import metrics
#Le module FactorAnalyzer
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
sns.set() #sets the theme of seaborn
#creates static png images of the plots within the notebook (other option: 'notebook' for interactive plots)
%matplotlib inline
# Load the cleaned customer dataset produced by the previous (cleaning) notebook.
# os.path.join + os.sep keep the path portable instead of hard-coding Windows '\\'
# separators; the trailing separator is preserved for any later string concatenation.
PATH_TO_DATA = os.path.join(os.getcwd(), 'DataCleaned') + os.sep
customers_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'Olist_Cleaned_Customers' + '.csv'))
customers_df
| Unnamed: 0 | customer_unique_id | last_purchase_date | recency | number_of_purchases | repeat_customer | monetary | online_timeofday | purchase_timeofday | review_timeofday | ... | used_voucher | used_boleto | used_card | order_payment_total | review_score | review_response_time | has_review_title | has_review_message | has_review | review_response_speed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 861eff4711a542e4b93843c6dd7febb0 | 2017-05-16 15:05:35 | -519.100498 | 1 | False | 146.87 | 18.833333 | 15.083333 | 22.566667 | ... | 0.0 | 0.0 | 1.0 | 146.87 | 4.0 | 4.940741 | 0.0 | 0.0 | 1.0 | 0.168329 |
| 1 | 1 | 290c77bc529b7ac935b93aa66c333dc3 | 2018-01-12 20:48:24 | -277.862431 | 1 | False | 335.48 | 21.750000 | 20.800000 | 22.716667 | ... | 0.0 | 0.0 | 1.0 | 335.48 | 5.0 | 11.946863 | 0.0 | 0.0 | 1.0 | 0.077239 |
| 2 | 2 | 060e732b5b29e8181a18229c7b0b2b5e | 2018-05-19 16:07:45 | -151.057326 | 1 | False | 157.73 | 14.150000 | 16.116667 | 12.166667 | ... | 0.0 | 0.0 | 1.0 | 157.73 | 5.0 | 0.507627 | 0.0 | 0.0 | 1.0 | 0.663294 |
| 3 | 3 | 259dac757896d24d7702b9acbbff3f3c | 2018-03-13 16:06:38 | -218.058102 | 1 | False | 173.30 | 17.350000 | 16.100000 | 18.600000 | ... | 0.0 | 0.0 | 1.0 | 173.30 | 5.0 | 4.775544 | 0.0 | 0.0 | 1.0 | 0.173144 |
| 4 | 4 | 345ecd01c38d18a9036ed96c73b8d066 | 2018-07-29 09:51:30 | -80.318611 | 1 | False | 252.25 | 5.916667 | 9.850000 | 25.983333 | ... | 0.0 | 0.0 | 1.0 | 252.25 | 5.0 | 7.083241 | 1.0 | 1.0 | 1.0 | 0.123713 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 96084 | 96084 | 1a29b476fee25c95fbafc67c5ac95cf8 | 2018-04-07 15:48:17 | -193.070845 | 1 | False | 88.78 | 13.516667 | 15.800000 | 11.250000 | ... | 0.0 | 0.0 | 1.0 | 88.78 | 4.0 | 14.468796 | 1.0 | 0.0 | 1.0 | 0.064646 |
| 96085 | 96085 | d52a67c98be1cf6a5c84435bd38d095d | 2018-04-04 08:20:22 | -196.381898 | 1 | False | 129.06 | 9.466667 | 8.333333 | 10.600000 | ... | 0.0 | 0.0 | 1.0 | 129.06 | 5.0 | 4.441725 | 0.0 | 0.0 | 1.0 | 0.183765 |
| 96086 | 96086 | e9f50caf99f032f0bf3c55141f019d99 | 2018-04-08 20:11:50 | -191.887824 | 1 | False | 56.04 | 22.933333 | 20.183333 | 25.683333 | ... | 0.0 | 0.0 | 1.0 | 56.04 | 1.0 | 4.070220 | 1.0 | 1.0 | 1.0 | 0.197230 |
| 96087 | 96087 | 73c2643a0a458b49f58cea58833b192e | 2017-11-03 21:08:33 | -347.848438 | 1 | False | 711.07 | 22.350000 | 21.133333 | 23.566667 | ... | 0.0 | 0.0 | 1.0 | 711.07 | 5.0 | 2.982153 | 0.0 | 0.0 | 1.0 | 0.251120 |
| 96088 | 96088 | 84732c5050c01db9b23e19ba39899398 | 2017-12-19 14:27:23 | -302.127025 | 1 | False | 21.77 | 18.900000 | 14.450000 | 23.350000 | ... | 0.0 | 0.0 | 1.0 | 21.77 | 5.0 | 0.972986 | 0.0 | 1.0 | 1.0 | 0.506846 |
96089 rows × 40 columns
# Drop the redundant CSV index column written by to_csv in the previous notebook.
customers_df.drop(columns=['Unnamed: 0'],inplace=True)
customers_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 96089 entries, 0 to 96088 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_unique_id 96089 non-null object 1 last_purchase_date 96089 non-null object 2 recency 96089 non-null float64 3 number_of_purchases 96089 non-null int64 4 repeat_customer 96089 non-null bool 5 monetary 96089 non-null float64 6 online_timeofday 96089 non-null float64 7 purchase_timeofday 96089 non-null float64 8 review_timeofday 96089 non-null float64 9 delivery_time 96089 non-null float64 10 delivery_speed 96089 non-null float64 11 delivery_delay 96089 non-null float64 12 order_purchase_timeofday 96089 non-null float64 13 order_is_delivered 96089 non-null float64 14 customer_lat 96089 non-null float64 15 customer_lng 96089 non-null float64 16 customer_zip_code_density 96089 non-null float64 17 customer_zip_code_density_3digits 96089 non-null float64 18 order_item_quantity 96089 non-null float64 19 has_multiple_items 96089 non-null float64 20 order_price 96089 non-null float64 21 order_freight 96089 non-null float64 22 order_total_price 96089 non-null float64 23 order_freight_frac 96089 non-null float64 24 order_avg_shipping_distance 96089 non-null float64 25 order_total_weight 96089 non-null float64 26 order_total_volume 96089 non-null float64 27 order_payments_quantity 96089 non-null float64 28 order_installments_total 96089 non-null float64 29 used_voucher 96089 non-null float64 30 used_boleto 96089 non-null float64 31 used_card 96089 non-null float64 32 order_payment_total 96088 non-null float64 33 review_score 96089 non-null float64 34 review_response_time 96089 non-null float64 35 has_review_title 96089 non-null float64 36 has_review_message 96089 non-null float64 37 has_review 96089 non-null float64 38 review_response_speed 96089 non-null float64 dtypes: bool(1), float64(35), int64(1), object(2) memory usage: 27.9+ MB
# Global random seed for reproducible sampling.
seed = 42
# Creating the projections takes the most time, so one might not want to run them,
# especially if one has already done so (they can be reloaded from a dill session).
global_do_projections = True
# Registries filled as the notebook runs: fitted models and 2D projections,
# keyed by model / feature-set version name.
all_my_models = {}
all_my_projections = {}
# log / log1p scalers for skewed features; check_inverse verifies that exp / expm1
# really invert them on the fitted data.
# NOTE(review): np.log requires strictly positive inputs — verify the selected columns.
logscale_transformer = FunctionTransformer(np.log, inverse_func = np.exp, check_inverse = True)
log1pscale_transformer = FunctionTransformer(np.log1p, inverse_func = np.expm1, check_inverse = True)
def create_preprocessor_X(logscale_features=None, standardscale_features=None, log1pscale_features=None):
    """Build a ColumnTransformer applying per-group scaling to the features.

    Parameters
    ----------
    logscale_features : list of str, optional
        Columns scaled with log -> StandardScaler (requires strictly positive values).
    standardscale_features : list of str, optional
        Columns scaled with StandardScaler only.
    log1pscale_features : list of str, optional
        Columns scaled with log1p -> StandardScaler (tolerates zeros).

    Returns
    -------
    ColumnTransformer
        Any column not listed is passed through unchanged (remainder='passthrough').
    """
    # Use None sentinels instead of mutable default arguments ([] shared across calls).
    logscale_features = [] if logscale_features is None else logscale_features
    standardscale_features = [] if standardscale_features is None else standardscale_features
    log1pscale_features = [] if log1pscale_features is None else log1pscale_features
    preprocessor_X = ColumnTransformer(transformers=[
        ('logscale', Pipeline([('log', logscale_transformer),
                               ('standard', StandardScaler())]),
         logscale_features),
        ('log1pscale', Pipeline([('log1p', log1pscale_transformer),
                                 ('standard', StandardScaler())]),
         log1pscale_features),
        ('standardscale', StandardScaler(), standardscale_features),
    ], remainder='passthrough')
    return preprocessor_X
def test_model(X_scaled, model_type="KMeans"):
    '''Fit a clustering model for 3..19 clusters and collect internal metrics.

    Parameters
    ----------
    X_scaled : array-like of shape (n_samples, n_features)
        Preprocessed (scaled) feature matrix.
    model_type : {"KMeans", "GaussianMixture", "Birch"}, default "KMeans"
        Which estimator family to evaluate.

    Returns
    -------
    dict
        Keys "models", "n_clusters", "score", "silouette_score", "c_h_score",
        "d_b_score" — lists aligned by index (one entry per cluster count).
        The misspelled key "silouette_score" is kept on purpose:
        plot_model_evaluation reads it by that name.

    Raises
    ------
    ValueError
        If model_type is not one of the supported names (the original code
        crashed later with a NameError on an unbound `model`).
    '''
    model_dict = {}
    model_list = []
    score = []
    silhouette = []
    calinski_harabasz = []
    davies_bouldin = []
    n_clusters = []
    for i in range(3, 20):
        if model_type == "KMeans":
            model = KMeans(n_clusters=i, random_state=33)
        elif model_type == "GaussianMixture":
            model = GaussianMixture(n_components=i, random_state=33)
        elif model_type == "Birch":
            model = Birch(n_clusters=i)
        else:
            # Fail fast with a clear message instead of a NameError below.
            raise ValueError("Unknown model_type: " + repr(model_type))
        model_list.append(model)
        X_labels = model.fit_predict(X_scaled)
        if model_type in ["KMeans", "GaussianMixture"]:
            # KMeans.score: negative inertia; GaussianMixture.score: mean log-likelihood.
            score.append(model.score(X_scaled))
        else:
            # Birch has no score(); keep the list aligned with a placeholder.
            score.append(0)
        silhouette.append(metrics.silhouette_score(X_scaled, X_labels))
        calinski_harabasz.append(metrics.calinski_harabasz_score(X_scaled, X_labels))
        davies_bouldin.append(metrics.davies_bouldin_score(X_scaled, X_labels))
        n_clusters.append(i)
    model_dict["models"] = model_list
    model_dict["n_clusters"] = n_clusters
    model_dict["score"] = score
    model_dict["silouette_score"] = silhouette
    model_dict["c_h_score"] = calinski_harabasz
    model_dict["d_b_score"] = davies_bouldin
    return model_dict
def create_projections_dict(projection_version, X_labels, X_labels_reduced):
    """Pair every stored 2D projection with the matching label array.

    MDS was fitted on the reduced 5% sample only, so it gets the reduced
    labels; every other projection covers the full dataset.
    """
    reduced_only = {"MDS"}
    return {
        name: (info["X_proj"], X_labels_reduced if name in reduced_only else X_labels)
        for name, info in all_my_projections[projection_version].items()
    }
def display_scree_plot(pca, name='', savefigure=False):
    '''Display a scree plot (explained variance per component, in %) for a fitted PCA.

    Parameters
    ----------
    pca : fitted decomposition model exposing explained_variance_ratio_.
    name : str, optional
        Prefix of the saved figure file.
    savefigure : bool, optional
        If True, save the figure under ./Figures/<name>_ScreePlot.
    '''
    scree = pca.explained_variance_ratio_ * 100
    # Bars: variance of each axis; red line: cumulative variance.
    plt.bar(np.arange(len(scree)) + 1, scree)
    plt.plot(np.arange(len(scree)) + 1, scree.cumsum(), c="red", marker='o')
    plt.xlabel("Rang de l'axe d'inertie")
    plt.ylabel("Pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    if savefigure:
        # os.path.join keeps the figure path portable (original hard-coded Windows '\\').
        plt.savefig(os.path.join(os.getcwd(), 'Figures', name + '_ScreePlot'), dpi=200)
    plt.show(block=False)
def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Display correlation circles, one for each factorial plane.

    Parameters
    ----------
    pcs : ndarray
        Component loadings, one row per principal component.
    n_comp : int
        Number of components; planes whose second axis index exceeds it are skipped.
    pca : fitted PCA
        Used for the explained-variance percentages in the axis labels.
    axis_ranks : list of (d1, d2)
        Component-index pairs; one circle is drawn per pair.
    labels : array-like of str, optional
        Variable names to annotate the arrows.
    label_rotation : float, optional
        Rotation (degrees) of the variable labels.
    lims : tuple (xmin, xmax, ymin, ymax), optional
        Explicit chart limits.
    """
    # Bugfix: LineCollection is used below but was never imported at file level
    # (only ListedColormap is imported from matplotlib) — import it locally.
    from matplotlib.collections import LineCollection
    # For each factorial plane
    for d1, d2 in axis_ranks:
        if d2 < n_comp:
            # Initialise the matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 10))
            # Determine the limits of the chart
            if lims is not None:
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30:
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else:
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])
            # Add arrows
            # If there are more than 30 arrows, we do not display the triangle at the end
            if pcs.shape[1] < 30:
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                           pcs[d1,:], pcs[d2,:],
                           angles='xy', scale_units='xy', scale=1, color="grey")
                # (see the doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            # Display variable names
            if labels is not None:
                for i, (x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax:
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center',
                                 rotation=label_rotation, color="blue", alpha=0.5)
            # Display circle
            circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)
            # Display grid lines
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')
            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))
            plt.title("Correlation Circle (PC{} and PC{})".format(d1+1, d2+1))
            plt.show(block=False)
def plot_model_evaluation(model_dict, figsize=(16, 8)):
    '''Plot the clustering metrics stored in a model dictionary.

    Draws a 2x2 grid: model score, silhouette (bounded in [-1, 1], near 0 means
    overlapping clusters), Davies-Bouldin (closer to 0 is a better partition),
    and Calinski-Harabasz (higher for dense, well-separated clusters).
    '''
    n_clusters = model_dict["n_clusters"]
    fig, axs = plt.subplots(2, 2, figsize=figsize)
    # (metric key in model_dict, panel title, whether the panel sits on the bottom row)
    panels = [
        ("score", "Score", False),
        ("silouette_score", "Silouhette", False),
        ("d_b_score", "Davies Bouldin", True),
        ("c_h_score", "Calinski Harabasz", True),
    ]
    # axs.flat yields (0,0), (0,1), (1,0), (1,1) — same order as the panel list.
    for panel_ax, (metric_key, title, on_bottom_row) in zip(axs.flat, panels):
        panel_ax.plot(n_clusters, model_dict[metric_key])
        panel_ax.set_title(title, y=0.9)
        if on_bottom_row:
            panel_ax.set_xlabel("Nombre de Clusters")
    fig.tight_layout()
    plt.show()
def plot_model_2D_projections(projections_dict):
    '''Plot each clustering in its 2D projections, two panels per row.

    Parameters
    ----------
    projections_dict : dict
        Maps projection name -> (X_proj, labels), where X_proj is an
        (n_samples, 2) array and labels colours the points.
    '''
    n_proj = len(projections_dict)
    # Round up so an odd number of projections still gets a full row
    # (the original floor division dropped the last panel and could even
    # request a 0-row grid for a single projection).
    n_rows = (n_proj + 1) // 2
    # squeeze=False keeps axs 2-D even when there is only one row.
    fig, axs = plt.subplots(n_rows, 2, figsize=(16, 8 * n_rows), squeeze=False)
    for i, (key, proj) in enumerate(projections_dict.items()):
        row, col = divmod(i, 2)
        X_proj, X_labels = proj
        axs[row, col].scatter(X_proj[:, 0], X_proj[:, 1], c=X_labels,
                              cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
        axs[row, col].set_title(key)
        axs[row, col].set_xlabel('Composante #1')
        axs[row, col].set_ylabel('Composante #2')
    fig.tight_layout()
    plt.show()
# Correlation heatmap of the candidate clustering features.
X = customers_df.copy()
# Features grouped thematically: payment method, basket, monetary, delivery,
# location, timing, reviews.
features_ordered = ['used_voucher', 'used_boleto', 'used_card',
                    'has_multiple_items', 'repeat_customer',
                    'monetary', 'order_payment_total', 'order_is_delivered',
                    'delivery_time', 'delivery_speed', 'delivery_delay', 'order_avg_shipping_distance',
                    'customer_lat', 'customer_lng', 'customer_zip_code_density',
                    'recency', 'online_timeofday', 'purchase_timeofday','review_timeofday',
                    'review_response_speed', 'has_review', 'has_review_title', 'has_review_message', 'review_score',]
# Drop rows with missing values before computing pairwise correlations.
corr = X[features_ordered].dropna().corr()
fig, ax = plt.subplots(figsize = (16,14))
sns.heatmap(corr, center = 0, cmap = 'bwr', annot = True, cbar = True)
<AxesSubplot:>
# ---- First feature set: every selected column is scaled (flags included) ----
passthrough_features =[]
standardscale_features = ['delivery_speed', 'review_response_speed', 'has_review_title', 'has_review_message',
                          'has_multiple_items', 'repeat_customer', 'used_voucher', 'used_boleto']
standardscale_features += ['recency', 'purchase_timeofday', 'review_timeofday',
                           'delivery_delay', 'review_score']
# log1p then standardise (log1p tolerates zeros, unlike log).
log1pscale_features = ['order_payment_total', 'order_avg_shipping_distance']
# Plain log then standardise — assumes strictly positive densities (TODO confirm).
logscale_features = ['customer_zip_code_density']
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
                                       standardscale_features = standardscale_features,
                                       log1pscale_features = log1pscale_features)
# Column selection follows the transformer order, so the scaled matrix can reuse X's column names.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
all_components = len(X_scaled.columns)
X_scaled.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 96088 entries, 0 to 96088 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_zip_code_density 96088 non-null float64 1 order_payment_total 96088 non-null float64 2 order_avg_shipping_distance 96088 non-null float64 3 delivery_speed 96088 non-null float64 4 review_response_speed 96088 non-null float64 5 has_review_title 96088 non-null float64 6 has_review_message 96088 non-null float64 7 has_multiple_items 96088 non-null float64 8 repeat_customer 96088 non-null float64 9 used_voucher 96088 non-null float64 10 used_boleto 96088 non-null float64 11 recency 96088 non-null float64 12 purchase_timeofday 96088 non-null float64 13 review_timeofday 96088 non-null float64 14 delivery_delay 96088 non-null float64 15 review_score 96088 non-null float64 dtypes: float64(16) memory usage: 12.5 MB
# Create the PCA model with as many components as features
pca = PCA(n_components=all_components)
# Fit the model with the standardised data
X_pca = pca.fit_transform(X_scaled)
display_scree_plot(pca, name='PCA')
# Generate a correlation circle for the first 7 loading columns
# NOTE(review): labels still lists all columns — only the first 7 names will be
# drawn; verify the label/column alignment for the second call below.
display_circles(pca.components_[:,0:7], all_components, pca, [(0,1)], labels = np.array(X_scaled.columns),)
# Generate a correlation circle for the remaining loading columns
display_circles(pca.components_[:,7:], all_components, pca, [(0,1)], labels = np.array(X_scaled.columns),)
# Heatmap of the loadings: variables (rows) vs principal components (columns).
comps = pd.DataFrame(pca.components_, columns=X_scaled.columns,
                     index=['Component ' + str(i+1) for i in range(pca.components_.shape[0])]).T
fig, ax = plt.subplots(figsize = (13,10))
sns.heatmap(comps, center = 0, cmap = 'bwr', annot = True, cbar = True)
<AxesSubplot:>
# Scatter of the data on the first two principal components.
plt.scatter(X_pca[:,0], X_pca[:,1], s=1)
plt.title('PCA')
plt.xlabel('Composante #1')
plt.ylabel('Composante #2')
Text(0, 0.5, 'Composante #2')
# Scatter of the data on principal components 3 and 4.
plt.scatter(X_pca[:,2], X_pca[:,3], s=1)
plt.title('PCA')
plt.xlabel('Composante #3')
plt.ylabel('Composante #4')
Text(0, 0.5, 'Composante #4')
# CHECK ADEQUACY of the data for factor analysis
# Bartlett's sphericity test: p-value should be ~0 (statistically significant)
chi_square_value,p_value=calculate_bartlett_sphericity(X_scaled)
print(chi_square_value, p_value)
# KMO: the overall measure should be greater than 0.6 for a factorable dataset
kmo_all,kmo_model=calculate_kmo(X_scaled)
print(kmo_model)
136221.44459931107 0.0 0.505998521198658
# ---- Second feature set: fewer columns, some passed through unscaled, to improve KMO ----
# NOTE(review): the empty list is immediately overwritten by the next assignment.
passthrough_features =[]
passthrough_features = ['delivery_speed', 'review_response_speed',
                        'repeat_customer', 'used_voucher', 'used_boleto']
standardscale_features = ['recency', 'purchase_timeofday', 'review_timeofday',
                         'delivery_delay', 'review_score']
log1pscale_features = ['order_payment_total']
logscale_features = ['customer_zip_code_density']
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
                                       standardscale_features = standardscale_features,
                                       log1pscale_features = log1pscale_features)
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
all_components = len(X_scaled.columns)
# CHECK ADEQUACY again on the reduced feature set
# Bartlett's sphericity test: p-value should be ~0 (statistically significant)
chi_square_value,p_value=calculate_bartlett_sphericity(X_scaled)
print(chi_square_value, p_value)
# KMO: the overall measure should be greater than 0.6 for a factorable dataset
kmo_all,kmo_model=calculate_kmo(X_scaled)
print(kmo_model)
53114.511823061366 0.0 0.6108624405910011
c:\users\bookj\environments\jupenv\lib\site-packages\factor_analyzer\utils.py:249: UserWarning: The inverse of the variance-covariance matrix was calculated using the Moore-Penrose generalized matrix inversion, due to its determinant being at or very close to zero.
warnings.warn('The inverse of the variance-covariance matrix '
# Exploratory factor analysis: scree of the eigenvalues to choose the number of factors.
fa = FactorAnalyzer(all_components, rotation=None)
fa.fit(X_scaled)
ev, v = fa.get_eigenvalues()
plt.scatter(range(1,X_scaled.shape[1]+1),ev)
plt.plot(range(1,X_scaled.shape[1]+1),ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.show()
# Possible rotations: varimax, promax, oblimin, oblimax, quartimin, quartimax, equamax
# (with a single factor no rotation is actually performed — the library warns about it).
fa = FactorAnalyzer(1, rotation='varimax')
fa.fit(X_scaled)
ev, v = fa.get_eigenvalues()
c:\users\bookj\environments\jupenv\lib\site-packages\factor_analyzer\factor_analyzer.py:656: UserWarning: No rotation will be performed when the number of factors equals 1.
warnings.warn('No rotation will be performed when '
# Loadings of each variable on the single retained factor.
loadings = pd.DataFrame(fa.loadings_, columns=['Factor ' + str(i+1) for i in range(fa.loadings_.shape[1])],
                        index=X_scaled.columns)
loadings
| Factor 1 | |
|---|---|
| customer_zip_code_density | 0.051199 |
| order_payment_total | -0.115016 |
| recency | 0.149092 |
| purchase_timeofday | 0.002584 |
| review_timeofday | 0.121830 |
| delivery_delay | -0.584960 |
| review_score | 0.469513 |
| delivery_speed | 0.812062 |
| review_response_speed | -0.100882 |
| repeat_customer | -0.006086 |
| used_voucher | -0.014071 |
| used_boleto | -0.045828 |
# Communalities: share of each variable's variance explained by the factor model.
pd.DataFrame(fa.get_communalities(), columns = ['Communality'], index=X_scaled.columns).sort_values(by='Communality',
                                                                                                   ascending=False)
| Communality | |
|---|---|
| delivery_speed | 0.659445 |
| delivery_delay | 0.342178 |
| review_score | 0.220442 |
| recency | 0.022228 |
| review_timeofday | 0.014843 |
| order_payment_total | 0.013229 |
| review_response_speed | 0.010177 |
| customer_zip_code_density | 0.002621 |
| used_boleto | 0.002100 |
| used_voucher | 0.000198 |
| repeat_customer | 0.000037 |
| purchase_timeofday | 0.000007 |
#passthrough_features =[]
# ---- Third feature set: binary flags and pre-normalised rates passed through unscaled ----
passthrough_features = ['delivery_speed', 'review_response_speed', 'has_review_title', 'has_review_message',
                        'has_multiple_items', 'repeat_customer', 'used_voucher', 'used_boleto']
standardscale_features = ['recency', 'purchase_timeofday', 'review_timeofday',
                         'delivery_delay', 'review_score']
log1pscale_features = ['order_payment_total', 'order_avg_shipping_distance']
logscale_features = ['customer_zip_code_density']
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
                                       standardscale_features = standardscale_features,
                                       log1pscale_features = log1pscale_features)
# Column selection follows the transformer order so X_scaled columns stay aligned.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
all_components = len(X_scaled.columns)
# 5% sample in float32 to make the manifold projections below tractable.
seed=42
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
X_reduced
| customer_zip_code_density | order_payment_total | order_avg_shipping_distance | recency | purchase_timeofday | review_timeofday | delivery_delay | review_score | delivery_speed | review_response_speed | has_review_title | has_review_message | has_multiple_items | repeat_customer | used_voucher | used_boleto | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36909 | 0.127666 | -0.922211 | -0.210527 | 0.234129 | -1.112484 | -0.906439 | -0.006406 | -2.297030 | 0.247829 | 0.644757 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 54623 | -0.991452 | -0.153872 | 0.535400 | -0.388032 | 1.734555 | -2.076979 | -0.169253 | 0.689147 | 0.483281 | 0.159264 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 44651 | -1.284504 | 0.931002 | 1.160912 | 0.801317 | -1.098494 | 0.031187 | -0.148818 | -0.057397 | 0.382437 | 0.265302 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 26894 | -0.764144 | 1.516786 | 1.348365 | -0.728092 | 0.912621 | -1.423030 | -0.276617 | 0.689147 | 0.558551 | 0.411409 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 25782 | 0.209203 | 0.772265 | -2.524258 | 1.024893 | 0.387982 | -0.739220 | -0.080289 | 0.689147 | 0.581235 | 0.217858 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 36174 | 1.246784 | 0.857752 | 0.963983 | -0.370982 | -0.203111 | -0.760122 | -0.274935 | 0.689147 | 0.578661 | 0.386863 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 59655 | 1.188559 | 0.229236 | 0.805022 | 0.477146 | 0.286552 | -1.443932 | 1.999461 | -0.803942 | 0.000000 | 0.701173 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10329 | 0.540699 | -1.589324 | 0.310131 | 1.250232 | -1.801510 | 1.079297 | -0.102412 | 0.689147 | 0.463202 | 0.199480 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 40204 | -0.578419 | -1.019287 | 0.497497 | 0.741725 | -1.808505 | 0.804579 | -0.233658 | 0.689147 | 0.573870 | 0.513067 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 94988 | 0.745115 | -1.191893 | -0.437318 | 0.711943 | 0.366997 | 0.461181 | -0.234040 | 0.689147 | 0.626625 | 0.535028 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4804 rows × 16 columns
# MDS projection of the 5% sample (fitted on the sample only).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
plt.scatter(X_reduced_mds[:,0], X_reduced_mds[:,1], s=1)
plt.title('MDS - avec 5% des Données')
plt.xlabel('Composante #1')
plt.ylabel('Composante #2')
Text(0, 0.5, 'Composante #2')
# Isomap projection of the 5% sample.
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
plt.scatter(X_reduced_iso[:,0], X_reduced_iso[:,1], s=1)
plt.title('Isomap - avec 5% des Données')
plt.xlabel('Composante #1')
plt.ylabel('Composante #2')
Text(0, 0.5, 'Composante #2')
# Locally Linear Embedding projection of the 5% sample.
lle = LocallyLinearEmbedding(n_components=2)
X_reduced_lle = lle.fit_transform(X_reduced)
plt.scatter(X_reduced_lle[:,0], X_reduced_lle[:,1], s=1)
plt.title('LocallyLinearEmbedding - avec 5% des Données')
plt.xlabel('Composante #1')
plt.ylabel('Composante #2')
Text(0, 0.5, 'Composante #2')
# t-SNE projection of the 5% sample (PCA initialisation for stability).
tsne = TSNE(n_components=2, init='pca')
X_reduced_tsne0 = tsne.fit_transform(X_reduced)
plt.scatter(X_reduced_tsne0[:,0], X_reduced_tsne0[:,1], s=1)
plt.title('t-SNE avec 5% des Données')
plt.xlabel('Composante #1')
plt.ylabel('Composante #2')
Text(0, 0.5, 'Composante #2')
import dill
# Snapshot the whole notebook session so the slow projections need not be recomputed.
# NOTE: dill session files are pickle-based — only load files you created yourself.
dill.dump_session('notebook_envChap4.db')
import dill
dill.load_session('notebook_envChap4.db')
# ---- "Version0" feature set: compact RFM-style view (recency / monetary / review / repeat) ----
passthrough_features = ['repeat_customer']
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total']
logscale_features = []
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
                                       standardscale_features = standardscale_features,
                                       log1pscale_features = log1pscale_features)
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% sample reused for the slow manifold methods.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
projection_version = "Version0"
# Bitwise '&' works here because both operands are bools; logical 'and' would be the idiom.
do_projection = True & global_do_projections
if do_projection :
    all_my_projections[projection_version] = {}
# PCA: linear and fast, fitted on the full scaled dataset.
if do_projection :
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS: fitted on the 5% sample only (create_projections_dict pairs it with the reduced labels).
if do_projection :
    mds = MDS(n_components=2)
    X_reduced_mds = mds.fit_transform(X_reduced)
    all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap: fitted on the sample, then used to project the full dataset.
if do_projection :
    iso = Isomap(n_components=2)
    X_reduced_iso = iso.fit_transform(X_reduced)
    X_iso = iso.transform(X_scaled)
    all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset (the slowest step).
if do_projection :
    tsne = TSNE(n_components=2, init='pca')
    X_tsne0 = tsne.fit_transform(X_scaled)
    all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Persist the session so the expensive projections can be reloaded later.
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# ---- KMeans on the "Version0" features ----
model_name = "KMeans_0"
# Metrics are computed on the 5% sample to keep the 17 fits affordable.
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
model_dict = all_my_models["KMeans_0"]
plot_model_evaluation(model_dict, figsize=(8,6))
# Retrieve the 4-cluster model, refit it on the full dataset, and label both sets.
n_clusters = 4
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = "Version0"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Per-cluster views of the scaled data, plus a copy carrying the labels.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
# Cluster centres side by side with each cluster's share of the customers.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
    pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
    right_index=True, left_index=True)
| order_payment_total | recency | review_score | repeat_customer | fraction | |
|---|---|---|---|---|---|
| 0 | -0.697226 | 0.637927 | 0.392993 | 0.033510 | 0.332071 |
| 1 | -0.190163 | -1.211966 | 0.367662 | 0.028677 | 0.259824 |
| 2 | 0.148376 | -0.033087 | -1.975800 | 0.026260 | 0.162466 |
| 3 | 1.047444 | 0.441436 | 0.386822 | 0.033933 | 0.245639 |
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| order_payment_total | count | 31908.000000 | 24966.000000 | 15611.000000 | 23603.000000 |
| mean | -0.698896 | -0.187974 | 0.148614 | 1.045347 | |
| std | 0.570838 | 0.783241 | 0.973317 | 0.741670 | |
| min | -5.884510 | -2.653588 | -2.519897 | 0.107629 | |
| 25% | -1.118477 | -0.726414 | -0.505065 | 0.497342 | |
| 50% | -0.644293 | -0.179118 | 0.101527 | 0.825650 | |
| 75% | -0.229910 | 0.324253 | 0.686578 | 1.391565 | |
| max | 0.288640 | 3.187162 | 4.407333 | 6.011733 | |
| recency | count | 31908.000000 | 24966.000000 | 15611.000000 | 23603.000000 |
| mean | 0.636579 | -1.212574 | -0.033180 | 0.443975 | |
| std | 0.557691 | 0.561322 | 0.898035 | 0.695024 | |
| min | -0.794544 | -2.971724 | -3.159611 | -2.222140 | |
| 25% | 0.175074 | -1.617156 | -0.492680 | -0.077513 | |
| 50% | 0.669678 | -1.161434 | 0.088020 | 0.503350 | |
| 75% | 1.120939 | -0.767217 | 0.554422 | 1.017420 | |
| max | 1.611766 | -0.141610 | 1.878712 | 1.872943 | |
| review_score | count | 31908.000000 | 24966.000000 | 15611.000000 | 23603.000000 |
| mean | 0.392931 | 0.367502 | -1.975998 | 0.387010 | |
| std | 0.477097 | 0.500747 | 0.520286 | 0.485015 | |
| min | -2.297030 | -1.550486 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -2.297030 | -0.057397 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -1.550486 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.803942 | 0.689147 | |
| repeat_customer | count | 31908.000000 | 24966.000000 | 15611.000000 | 23603.000000 |
| mean | 0.033565 | 0.028679 | 0.026264 | 0.033852 | |
| std | 0.180110 | 0.166906 | 0.159923 | 0.180851 | |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Boxplots of every scaled feature, grouped by cluster label.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# Same boxplots, with the features on the x-axis and one hue per cluster.
fig, ax = plt.subplots(figsize = (10,4))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='Feature', ylabel='value'>
# Clusters in the (recency, order_payment_total) plane with the KMeans centres overlaid
# (centre column 1 = recency, column 0 = order_payment_total, matching X_scaled's order).
my_cmap = sns.color_palette('tab10')[:4]
fig, ax = plt.subplots(figsize = (6,6))
plt.scatter(X_scaled['recency'], X_scaled['order_payment_total'], c=X_labels,
            cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
plt.scatter(km.cluster_centers_[:, 1], km.cluster_centers_[:, 0], s=100, c=my_cmap, edgecolors='black')
plt.xlabel('recency')
plt.ylabel('order_payment_total')
Text(0, 0.5, 'order_payment_total')
# Clusters in the (review_score, repeat_customer) plane with the KMeans centres overlaid
# (centre column 2 = review_score, column 3 = repeat_customer).
my_cmap = sns.color_palette('tab10')[:4]
fig, ax = plt.subplots(figsize = (6,6))
plt.scatter(X_scaled['review_score'], X_scaled['repeat_customer'], c=X_labels,
            cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
plt.scatter(km.cluster_centers_[:, 2], km.cluster_centers_[:, 3], s=100, c=my_cmap, edgecolors='black')
plt.xlabel('review_score')
plt.ylabel('repeat_customer')
Text(0, 0.5, 'repeat_customer')
# ---- GaussianMixture on the same "Version0" features ----
model_name = "GaussianMixture_0"
# Metrics on the 5% sample, same protocol as for KMeans.
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict, figsize=(8,6))
# Retrieve the 4-component mixture, refit on the full dataset, and label both sets.
n_clusters = 4
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version0"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
# Per-component views of the scaled data, plus a copy carrying the labels.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
# Component means next to their mixture weights (share of customers per component).
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
                                                        left_index=True, right_index=True)
| order_payment_total | recency | review_score | repeat_customer | weights | |
|---|---|---|---|---|---|
| 0 | -0.029977 | -0.019311 | -0.057397 | 0.0 | 0.186929 |
| 1 | 0.040291 | 0.126782 | 0.022003 | 1.0 | 0.031180 |
| 2 | -0.028102 | 0.020980 | 0.689147 | 0.0 | 0.553700 |
| 3 | 0.087239 | -0.052411 | -1.628179 | 0.0 | 0.228192 |
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| order_payment_total | count | 1.796400e+04 | 2996.000000 | 5.320400e+04 | 21924.000000 |
| mean | -2.998651e-02 | 0.040291 | -2.810191e-02 | 0.087261 | |
| std | 9.844157e-01 | 0.849903 | 9.965222e-01 | 1.034319 | |
| min | -2.653588e+00 | -2.373319 | -2.936358e+00 | -5.884510 | |
| 25% | -7.128105e-01 | -0.535269 | -7.294342e-01 | -0.646413 | |
| 50% | -7.967820e-02 | -0.001765 | -7.531103e-02 | 0.031048 | |
| 75% | 5.570630e-01 | 0.556712 | 5.647607e-01 | 0.694327 | |
| max | 4.695665e+00 | 4.408529 | 5.163559e+00 | 6.011733 | |
| recency | count | 1.796400e+04 | 2996.000000 | 5.320400e+04 | 21924.000000 |
| mean | -1.931737e-02 | 0.126782 | 2.097949e-02 | -0.052409 | |
| std | 9.975993e-01 | 0.947539 | 1.014460e+00 | 0.970084 | |
| min | -2.971724e+00 | -2.947949 | -2.970209e+00 | -3.159611 | |
| 25% | -7.313301e-01 | -0.512982 | -7.287225e-01 | -0.665180 | |
| 50% | 9.696533e-02 | 0.258862 | 1.365878e-01 | 0.100045 | |
| 75% | 7.887259e-01 | 0.886689 | 8.570198e-01 | 0.650810 | |
| max | 1.553150e+00 | 1.872943 | 1.571963e+00 | 1.878712 | |
| review_score | count | 1.796400e+04 | 2996.000000 | 5.320400e+04 | 21924.000000 |
| mean | -5.739745e-02 | 0.022003 | 6.891468e-01 | -1.628362 | |
| std | 2.081726e-17 | 0.851067 | 2.220467e-16 | 0.690656 | |
| min | -5.739745e-02 | -2.297030 | 6.891468e-01 | -2.297030 | |
| 25% | -5.739745e-02 | -0.430670 | 6.891468e-01 | -2.297030 | |
| 50% | -5.739745e-02 | 0.315875 | 6.891468e-01 | -1.550486 | |
| 75% | -5.739745e-02 | 0.689147 | 6.891468e-01 | -0.803942 | |
| max | -5.739745e-02 | 0.689147 | 6.891468e-01 | 0.315875 | |
| repeat_customer | count | 1.796400e+04 | 2996.000000 | 5.320400e+04 | 21924.000000 |
| mean | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | |
| std | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | |
| min | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | |
| 25% | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | |
| 50% | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | |
| 75% | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | |
| max | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 |
# Feature distributions within each cluster.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# Same data, grouped by feature instead of by cluster.
fig, ax = plt.subplots(figsize = (10,4))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='Feature', ylabel='value'>
# Inspect the richer n_clusters = 8 Gaussian-mixture solution.
n_clusters = 8
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the 2-D projections.
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version0"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means (standardized units) alongside the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| order_payment_total | recency | review_score | repeat_customer | weights | |
|---|---|---|---|---|---|
| 0 | -0.029987 | -0.019317 | -0.057397 | 0.0 | 0.186953 |
| 1 | 0.005531 | 0.117870 | 0.689147 | 1.0 | 0.014303 |
| 2 | 0.181741 | -0.040932 | -2.297030 | 0.0 | 0.110690 |
| 3 | -0.028102 | 0.020979 | 0.689147 | 0.0 | 0.553701 |
| 4 | 0.069751 | 0.134334 | -0.543425 | 1.0 | 0.016876 |
| 5 | -0.024402 | -0.065543 | -0.803942 | 0.0 | 0.086691 |
| 6 | 0.063293 | -0.056090 | -1.550486 | 0.0 | 0.030618 |
| 7 | -0.176526 | -0.166692 | -0.663950 | 0.0 | 0.000167 |
# Attach labels and compute per-cluster summary statistics.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|---|
| order_payment_total | count | 1.796400e+04 | 1.375000e+03 | 1.063600e+04 | 5.320400e+04 | 1621.000000 | 8.330000e+03 | 2942.000000 | 16.000000 |
| mean | -2.998651e-02 | 5.508900e-03 | 1.817408e-01 | -2.810191e-02 | 0.069795 | -2.440373e-02 | 0.063292 | -0.176070 | |
| std | 9.844157e-01 | 8.364339e-01 | 1.063533e+00 | 9.965222e-01 | 0.860319 | 9.881557e-01 | 1.021494 | 1.183945 | |
| min | -2.653588e+00 | -2.373319e+00 | -5.884510e+00 | -2.936358e+00 | -2.240456 | -5.884510e+00 | -2.477517 | -1.687815 | |
| 25% | -7.128105e-01 | -5.515666e-01 | -5.661769e-01 | -7.294342e-01 | -0.521430 | -7.297871e-01 | -0.664084 | -0.993451 | |
| 50% | -7.967820e-02 | -6.085340e-02 | 1.111319e-01 | -7.531103e-02 | 0.031377 | -6.531832e-02 | 0.021524 | -0.392363 | |
| 75% | 5.570630e-01 | 5.221448e-01 | 7.928207e-01 | 5.647607e-01 | 0.592272 | 5.731578e-01 | 0.672463 | 0.090106 | |
| max | 4.695665e+00 | 4.408529e+00 | 6.011733e+00 | 5.163559e+00 | 4.192449 | 5.162278e+00 | 4.536640 | 2.350199 | |
| recency | count | 1.796400e+04 | 1.375000e+03 | 1.063600e+04 | 5.320400e+04 | 1621.000000 | 8.330000e+03 | 2942.000000 | 16.000000 |
| mean | -1.931737e-02 | 1.179238e-01 | -4.093183e-02 | 2.097949e-02 | 0.134295 | -6.554275e-02 | -0.056091 | -0.166959 | |
| std | 9.975993e-01 | 9.613989e-01 | 9.556414e-01 | 1.014460e+00 | 0.935853 | 9.874385e-01 | 0.972891 | 0.843025 | |
| min | -2.971724e+00 | -2.926392e+00 | -3.159611e+00 | -2.970209e+00 | -2.947949 | -2.970562e+00 | -2.967066 | -1.784638 | |
| 25% | -7.313301e-01 | -5.665274e-01 | -5.550357e-01 | -7.287225e-01 | -0.475151 | -7.438980e-01 | -0.713488 | -0.795152 | |
| 50% | 9.696533e-02 | 2.479029e-01 | 1.323568e-01 | 1.365878e-01 | 0.268646 | 6.966796e-02 | 0.076218 | -0.105740 | |
| 75% | 7.887259e-01 | 9.146402e-01 | 5.986601e-01 | 8.570198e-01 | 0.856937 | 6.990590e-01 | 0.680597 | 0.446106 | |
| max | 1.553150e+00 | 1.872943e+00 | 1.878712e+00 | 1.571963e+00 | 1.787829 | 1.609604e+00 | 1.554111 | 1.257407 | |
| review_score | count | 1.796400e+04 | 1.375000e+03 | 1.063600e+04 | 5.320400e+04 | 1621.000000 | 8.330000e+03 | 2942.000000 | 16.000000 |
| mean | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | -0.543896 | -8.039417e-01 | -1.550486 | -0.663965 | |
| std | 2.081726e-17 | 1.110627e-16 | 8.882202e-16 | 2.220467e-16 | 0.800550 | 1.110290e-16 | 0.000000 | 0.706593 | |
| min | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | -2.297030 | -8.039417e-01 | -1.550486 | -1.923758 | |
| 25% | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | -0.803942 | -8.039417e-01 | -1.550486 | -1.177214 | |
| 50% | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | -0.430670 | -8.039417e-01 | -1.550486 | -0.803942 | |
| 75% | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | -0.057397 | -8.039417e-01 | -1.550486 | -0.244034 | |
| max | -5.739745e-02 | 6.891468e-01 | -2.297030e+00 | 6.891468e-01 | 0.564723 | -8.039417e-01 | -1.550486 | 0.315875 | |
| repeat_customer | count | 1.796400e+04 | 1.375000e+03 | 1.063600e+04 | 5.320400e+04 | 1621.000000 | 8.330000e+03 | 2942.000000 | 16.000000 |
| mean | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| std | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| min | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| 25% | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| 50% | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| 75% | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | |
| max | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 |
# Feature distributions within each of the 8 clusters.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# BIRCH on the Version0 features; inspect the n_clusters = 7 solution.
model_name = "BIRCH_0"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict, figsize=(8,6))
n_clusters = 7
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the projections.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projection_version = "Version0"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
# Per-cluster summary statistics.
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|---|---|
| order_payment_total | count | 24601.000000 | 5768.000000 | 8954.000000 | 15550.000000 | 623.000000 | 2196.000000 | 38396.000000 |
| mean | 0.925908 | 1.143896 | -0.844400 | -0.385736 | 2.655210 | 0.848150 | -0.503543 | |
| std | 0.712493 | 0.831204 | 0.509346 | 0.632186 | 0.661806 | 0.688441 | 0.659949 | |
| min | -1.102068 | 0.056348 | -5.884510 | -2.653588 | 1.615346 | -0.490326 | -2.936358 | |
| 25% | 0.448991 | 0.546852 | -1.190361 | -0.799299 | 2.148786 | 0.293494 | -1.014719 | |
| 50% | 0.823092 | 0.867111 | -0.812159 | -0.383549 | 2.512583 | 0.784747 | -0.453333 | |
| 75% | 1.308895 | 1.479390 | -0.487416 | 0.052068 | 3.123463 | 1.330449 | 0.032857 | |
| max | 4.673832 | 5.163559 | 0.435257 | 1.229602 | 6.011733 | 3.166997 | 0.765885 | |
| recency | count | 24601.000000 | 5768.000000 | 8954.000000 | 15550.000000 | 623.000000 | 2196.000000 | 38396.000000 |
| mean | 0.501572 | -1.349789 | 0.319626 | -1.388917 | -0.121746 | -1.287654 | 0.444986 | |
| std | 0.607100 | 0.535105 | 0.640625 | 0.496509 | 0.647441 | 0.583193 | 0.684544 | |
| min | -2.132357 | -2.966267 | -1.484151 | -3.159611 | -2.059948 | -2.966621 | -1.373505 | |
| 25% | 0.044654 | -1.759060 | -0.163881 | -1.758192 | -0.424513 | -1.692309 | -0.147725 | |
| 50% | 0.503217 | -1.366971 | 0.334118 | -1.369328 | -0.168328 | -1.264066 | 0.495549 | |
| 75% | 1.005745 | -0.923810 | 0.744684 | -0.997046 | 0.173343 | -0.893971 | 1.038797 | |
| max | 1.872943 | 0.326246 | 1.878712 | -0.122230 | 1.491930 | 0.182717 | 1.610699 | |
| review_score | count | 24601.000000 | 5768.000000 | 8954.000000 | 15550.000000 | 623.000000 | 2196.000000 | 38396.000000 |
| mean | -0.334334 | 0.628121 | -1.489469 | 0.126723 | -1.968694 | -1.834803 | 0.552761 | |
| std | 1.083326 | 0.227321 | 0.843669 | 0.791739 | 0.535046 | 0.561812 | 0.290950 | |
| min | -2.297030 | -1.550486 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -1.177214 | |
| 25% | -0.803942 | 0.689147 | -2.297030 | -0.057397 | -2.297030 | -2.297030 | 0.689147 | |
| 50% | -0.057397 | 0.689147 | -1.550486 | 0.689147 | -2.297030 | -2.297030 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -0.803942 | 0.689147 | -1.550486 | -1.550486 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.057397 | 0.689147 | -0.803942 | -0.803942 | 0.689147 | |
| repeat_customer | count | 24601.000000 | 5768.000000 | 8954.000000 | 15550.000000 | 623.000000 | 2196.000000 | 38396.000000 |
| mean | 0.046949 | 0.003121 | 0.011168 | 0.034019 | 0.003210 | 0.005009 | 0.030758 | |
| std | 0.211535 | 0.055781 | 0.105094 | 0.181285 | 0.056614 | 0.070614 | 0.172665 | |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Feature distributions within each of the 7 BIRCH clusters.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Version1 feature set: 3 features (repeat_customer dropped). ---
passthrough_features = []
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total']  # heavy right tail -> log1p before scaling
logscale_features = []
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Column selection order must match the transformer order so the scaled matrix
# can be relabeled with the same names below (assumed — confirm in create_preprocessor_X).
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% random sample: cheaper input for model selection and manifold projections.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# Build the 2-D projections used to visualize the Version1 clusterings.
projection_version = "Version1"
# Master toggle for these expensive computations. Use the logical `and`
# operator rather than bitwise `&`: it states the intent and short-circuits,
# whereas `&` is a bit operation that only happens to work on booleans.
do_projection = True and global_do_projections
if do_projection:
    all_my_projections[projection_version] = {}
if do_projection:
    # Linear projection, fitted on the full standardized data.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
if do_projection:
    # MDS has no transform(); it is fitted only on the 5% sample.
    mds = MDS(n_components=2)
    X_reduced_mds = mds.fit_transform(X_reduced)
    all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
if do_projection:
    # Isomap is fitted on the sample, then used to project the full dataset.
    iso = Isomap(n_components=2)
    X_reduced_iso = iso.fit_transform(X_reduced)
    X_iso = iso.transform(X_scaled)
    all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
if do_projection:
    # t-SNE on the full dataset; PCA initialization stabilizes the embedding.
    tsne = TSNE(n_components=2, init='pca')
    X_tsne0 = tsne.fit_transform(X_scaled)
    all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint / restore the whole notebook session on disk. The import must
# come first: the original called dill.dump_session() one line before
# `import dill`, which raises NameError when these cells run in this order.
import dill
dill.dump_session('notebook_env.db')   # save every global to disk
dill.load_session('notebook_env.db')   # restore the saved session
# KMeans on the Version1 features; inspect the n_clusters = 4 solution.
model_name = "KMeans_1"
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
model_dict = all_my_models["KMeans_1"]
plot_model_evaluation(model_dict)
n_clusters = 4
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the projections.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = 'Version1'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
# Cluster centers (standardized units) with the fraction of points per cluster.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| order_payment_total | recency | review_score | fraction | |
|---|---|---|---|---|
| 0 | -0.175163 | -1.210569 | 0.369466 | 0.260907 |
| 1 | 0.147798 | -0.034617 | -1.974788 | 0.162611 |
| 2 | -0.704222 | 0.631319 | 0.392874 | 0.331040 |
| 3 | 1.042115 | 0.459609 | 0.386216 | 0.245442 |
# Undo the standard scaling on the recency/review_score center coordinates
# to read them in original units (columns 1:3 of the 3-feature center matrix).
display(pd.DataFrame(preprocessor_X.named_transformers_['standardscale'].inverse_transform(km.cluster_centers_.T[1:4].T),
columns=X_scaled.columns.values[1:3], index=[i for i in range(4)]))
| recency | review_score | |
|---|---|---|
| 0 | -473.874237 | 4.571786 |
| 1 | -293.491398 | 1.431645 |
| 2 | -191.341410 | 4.603141 |
| 3 | -217.680550 | 4.594222 |
preprocessor_X.named_transformers_['log1pscale'].inverse_transform(km.cluster_centers_.T[0])
array([ 95.56021127, 124.04683619, 62.22317476, 254.84271256])
# Same inverse transform, displayed as a labeled Series.
display(pd.Series(preprocessor_X.named_transformers_['log1pscale'].inverse_transform(km.cluster_centers_.T[0]),
name=X_scaled.columns.values[0], index=[i for i in range(4)]))
0 95.560211 1 124.046836 2 62.223175 3 254.842713 Name: order_payment_total, dtype: float64
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| order_payment_total | count | 25070.000000 | 15625.000000 | 31809.000000 | 23584.000000 |
| mean | -0.176852 | 0.148411 | -0.704818 | 1.040294 | |
| std | 0.787155 | 0.973395 | 0.568637 | 0.742839 | |
| min | -2.653588 | -2.519897 | -5.884510 | 0.107629 | |
| 25% | -0.714204 | -0.505065 | -1.121233 | 0.492499 | |
| 50% | -0.166656 | 0.101216 | -0.648821 | 0.822190 | |
| 75% | 0.337690 | 0.686870 | -0.237231 | 1.386748 | |
| max | 3.187162 | 4.407333 | 0.262394 | 6.011733 | |
| recency | count | 25070.000000 | 15625.000000 | 31809.000000 | 23584.000000 |
| mean | -1.211283 | -0.034113 | 0.631940 | 0.457875 | |
| std | 0.561485 | 0.898192 | 0.559593 | 0.688256 | |
| min | -2.971724 | -3.159611 | -0.879425 | -2.222140 | |
| 25% | -1.616050 | -0.493681 | 0.165404 | -0.042585 | |
| 50% | -1.160408 | 0.087129 | 0.659702 | 0.516141 | |
| 75% | -0.766559 | 0.553666 | 1.115593 | 1.025986 | |
| max | -0.141610 | 1.878712 | 1.611766 | 1.872943 | |
| review_score | count | 25070.000000 | 15625.000000 | 31809.000000 | 23584.000000 |
| mean | 0.368797 | -1.975139 | 0.392729 | 0.386851 | |
| std | 0.499389 | 0.520941 | 0.477120 | 0.485292 | |
| min | -1.550486 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -2.297030 | -0.057397 | -0.057397 | |
| 50% | 0.689147 | -2.297030 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | -1.550486 | 0.689147 | 0.689147 | |
| max | 0.689147 | -0.803942 | 0.689147 | 0.689147 |
# Feature distributions per cluster (then per feature) for the 4-cluster KMeans.
fig, ax = plt.subplots(figsize = (10,4))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']),
palette = 'Purples_r')
<AxesSubplot:xlabel='labels', ylabel='value'>
fig, ax = plt.subplots(figsize = (8,4))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='Feature', ylabel='value'>
# Raw-feature view: recency vs order_payment_total, colored by cluster.
my_cmap = sns.color_palette('tab10')[:4]  # one center color per cluster
fig, ax = plt.subplots(figsize = (6,6))
plt.scatter(X_scaled['recency'], X_scaled['order_payment_total'], c=X_labels,
cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
# Center columns: 0 = order_payment_total, 1 = recency (Version1 feature order).
plt.scatter(km.cluster_centers_[:, 1], km.cluster_centers_[:, 0], s=100, c=my_cmap, edgecolors='black')
plt.xlabel('recency')
plt.ylabel('order_payment_total')
Text(0, 0.5, 'order_payment_total')
# Inspect the n_clusters = 5 KMeans solution.
n_clusters = 5
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the projections.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
data = X_scaled.copy()
data['labels'] = X_labels
projection_version  # display the current projection version (still 'Version1')
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centers with the fraction of points assigned to each cluster.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| order_payment_total | recency | review_score | fraction | |
|---|---|---|---|---|
| 0 | -0.957454 | 0.560229 | 0.371115 | 0.238469 |
| 1 | 1.626471 | -0.468529 | 0.273798 | 0.109306 |
| 2 | 0.104007 | -0.028073 | -2.000628 | 0.156617 |
| 3 | 0.442151 | 0.764435 | 0.414489 | 0.261833 |
| 4 | -0.351019 | -1.190371 | 0.369634 | 0.233775 |
# Attach labels and compute per-cluster summary statistics.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| order_payment_total | count | 22914.000000 | 10503.000000 | 15049.000000 | 25159.000000 | 22463.000000 |
| mean | -0.957986 | 1.631753 | 0.103725 | 0.440698 | -0.348817 | |
| std | 0.461883 | 0.776583 | 0.919864 | 0.486828 | 0.646736 | |
| min | -5.884510 | 0.469468 | -2.519897 | -0.386074 | -2.653588 | |
| 25% | -1.295877 | 1.024200 | -0.522198 | 0.069394 | -0.788683 | |
| 50% | -0.890388 | 1.449839 | 0.072368 | 0.372764 | -0.309293 | |
| 75% | -0.589977 | 2.122650 | 0.662164 | 0.719881 | 0.144228 | |
| max | -0.146770 | 6.011733 | 3.367489 | 2.455626 | 1.395398 | |
| recency | count | 22914.000000 | 10503.000000 | 15049.000000 | 25159.000000 | 22463.000000 |
| mean | 0.559756 | -0.463667 | -0.028237 | 0.763640 | -1.190574 | |
| std | 0.563784 | 0.818236 | 0.897901 | 0.503882 | 0.569797 | |
| min | -0.879425 | -2.966073 | -3.159611 | -0.391344 | -2.971724 | |
| 25% | 0.097061 | -1.001272 | -0.475640 | 0.377264 | -1.601650 | |
| 50% | 0.543935 | -0.428571 | 0.091164 | 0.804425 | -1.138545 | |
| 75% | 1.042471 | 0.059962 | 0.550320 | 1.196239 | -0.730361 | |
| max | 1.610699 | 1.609604 | 1.878712 | 1.872943 | -0.135043 | |
| review_score | count | 22914.000000 | 10503.000000 | 15049.000000 | 25159.000000 | 22463.000000 |
| mean | 0.371171 | 0.272398 | -2.000429 | 0.414654 | 0.369771 | |
| std | 0.497288 | 0.636206 | 0.499044 | 0.456034 | 0.500918 | |
| min | -2.297030 | -2.297030 | -2.297030 | -1.177214 | -1.550486 | |
| 25% | -0.057397 | -0.057397 | -2.297030 | -0.057397 | -0.057397 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -1.550486 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.803942 | 0.689147 | 0.689147 |
# Feature distributions for the 5-cluster solution, both groupings.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='Feature', ylabel='value'>
# Inspect the n_clusters = 10 KMeans solution.
n_clusters = 10
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the projections.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = 'Version1'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centers with the fraction of points assigned to each cluster.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| order_payment_total | recency | review_score | fraction | |
|---|---|---|---|---|
| 0 | 1.213875 | 0.258268 | -1.919661 | 0.058301 |
| 1 | -0.881183 | -1.186717 | 0.469738 | 0.124001 |
| 2 | 0.203569 | -0.175222 | 0.550857 | 0.144680 |
| 3 | -0.128471 | -1.446078 | -1.677336 | 0.046988 |
| 4 | 0.744005 | -1.459412 | 0.430900 | 0.090552 |
| 5 | -1.069091 | 0.718011 | 0.551624 | 0.156846 |
| 6 | 0.353919 | 1.036403 | 0.498670 | 0.163527 |
| 7 | 1.973294 | 0.321152 | 0.419410 | 0.064066 |
| 8 | -0.521847 | 0.425932 | -2.143156 | 0.063910 |
| 9 | -0.409777 | 0.317329 | -0.507506 | 0.087128 |
# Attach labels and compute per-cluster summary statistics.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| order_payment_total | count | 5602.000000 | 11915.000000 | 13902.000000 | 4515.000000 | 8701.000000 | 15071.000000 | 15713.000000 | 6156.000000 | 6141.000000 | 8372.000000 |
| mean | 1.213774 | -0.881050 | 0.203197 | -0.129139 | 0.744187 | -1.070069 | 0.353529 | 1.972968 | -0.521847 | -0.404649 | |
| std | 0.709645 | 0.471897 | 0.432928 | 0.732511 | 0.606332 | 0.436214 | 0.436532 | 0.661104 | 0.596247 | 0.541730 | |
| min | 0.233627 | -2.880980 | -0.767511 | -2.431793 | -0.318691 | -5.884510 | -0.507764 | 0.990410 | -5.884510 | -2.519897 | |
| 25% | 0.699171 | -1.221434 | -0.119716 | -0.629874 | 0.274458 | -1.363765 | 0.006121 | 1.455980 | -0.919391 | -0.747611 | |
| 50% | 1.033158 | -0.820589 | 0.193248 | -0.104983 | 0.654130 | -1.028219 | 0.332143 | 1.807224 | -0.449863 | -0.413797 | |
| 75% | 1.527617 | -0.501868 | 0.521129 | 0.354271 | 1.099738 | -0.707351 | 0.680886 | 2.340276 | -0.036448 | -0.030067 | |
| max | 6.011733 | -0.032402 | 1.255348 | 3.166997 | 3.625928 | -0.273512 | 1.555226 | 5.163559 | 0.477627 | 0.771355 | |
| recency | count | 5602.000000 | 11915.000000 | 13902.000000 | 4515.000000 | 8701.000000 | 15071.000000 | 15713.000000 | 6156.000000 | 6141.000000 | 8372.000000 |
| mean | 0.257742 | -1.186608 | -0.176115 | -1.445949 | -1.459545 | 0.716327 | 1.036346 | 0.321345 | 0.425932 | 0.322162 | |
| std | 0.657984 | 0.533972 | 0.337790 | 0.516955 | 0.465654 | 0.505285 | 0.321864 | 0.703535 | 0.571053 | 0.521863 | |
| min | -2.223947 | -2.971724 | -1.021170 | -3.159611 | -2.970152 | -0.356044 | 0.333789 | -2.196948 | -0.871284 | -0.851563 | |
| 25% | -0.219481 | -1.563065 | -0.415575 | -1.825642 | -1.811888 | 0.309380 | 0.769154 | -0.220409 | -0.036662 | -0.117638 | |
| 50% | 0.302750 | -1.128732 | -0.187760 | -1.418970 | -1.432155 | 0.735367 | 1.061422 | 0.373291 | 0.386551 | 0.302844 | |
| 75% | 0.718468 | -0.775426 | 0.093824 | -1.042864 | -1.093577 | 1.147753 | 1.331659 | 0.859480 | 0.841572 | 0.685263 | |
| max | 1.787829 | -0.226771 | 0.497426 | -0.325414 | -0.495977 | 1.610699 | 1.872943 | 1.609604 | 1.878712 | 1.554208 | |
| review_score | count | 5602.000000 | 11915.000000 | 13902.000000 | 4515.000000 | 8701.000000 | 15071.000000 | 15713.000000 | 6156.000000 | 6141.000000 | 8372.000000 |
| mean | -1.920393 | 0.468778 | 0.550734 | -1.676481 | 0.430896 | 0.551202 | 0.500390 | 0.419177 | -2.143156 | -0.507978 | |
| std | 0.566193 | 0.380589 | 0.289218 | 0.663284 | 0.419191 | 0.297959 | 0.349904 | 0.438183 | 0.301673 | 0.366688 | |
| min | -2.297030 | -0.803942 | -0.617306 | -2.297030 | -0.803942 | -0.803942 | -0.803942 | -1.550486 | -2.297030 | -1.301638 | |
| 25% | -2.297030 | -0.057397 | 0.689147 | -2.297030 | -0.057397 | 0.689147 | 0.689147 | -0.057397 | -2.297030 | -0.803942 | |
| 50% | -2.297030 | 0.689147 | 0.689147 | -1.550486 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | -0.803942 | |
| 75% | -1.550486 | 0.689147 | 0.689147 | -0.803942 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | -0.057397 | |
| max | -0.803942 | 0.689147 | 0.689147 | -0.555094 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -1.177214 | 0.129239 |
# Feature distributions within each of the 10 clusters.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# Gaussian mixture on the Version1 features; inspect the n_clusters = 3 solution.
model_name = "GaussianMixture_1"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 3
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Fix: refit on the full standardized data FIRST, then label the reduced
# sample — matching every other cell in this notebook. The original predicted
# X_reduced BEFORE fit_predict(X_scaled) refit the mixture, so the two label
# sets could come from differently-ordered components and the colored
# projections would not line up.
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
gm.weights_
array([0.33057976, 0.40119277, 0.26822747])
gm.means_
array([[-0.015425 , -0.8559745 , 0.51220534],
[-0.03351195, 0.7402335 , 0.52743863],
[ 0.06913514, -0.05222613, -1.42017252]])
# 2-D projections colored by the 3-component mixture labels.
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means (standardized units) alongside the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| order_payment_total | recency | review_score | weights | |
|---|---|---|---|---|
| 0 | -0.015425 | -0.855975 | 0.512205 | 0.330580 |
| 1 | -0.033512 | 0.740234 | 0.527439 | 0.401193 |
| 2 | 0.069135 | -0.052226 | -1.420173 | 0.268227 |
# Attach labels and compute per-cluster summary statistics.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | |
|---|---|---|---|---|
| order_payment_total | count | 32445.000000 | 40858.000000 | 22785.000000 |
| mean | -0.019340 | -0.034800 | 0.089943 | |
| std | 0.992547 | 0.986319 | 1.029181 | |
| min | -2.880980 | -2.936358 | -5.884510 | |
| 25% | -0.713607 | -0.726363 | -0.637156 | |
| 50% | -0.082918 | -0.071078 | 0.034334 | |
| 75% | 0.566387 | 0.558951 | 0.694375 | |
| max | 5.163559 | 4.673832 | 6.011733 | |
| recency | count | 32445.000000 | 40858.000000 | 22785.000000 |
| mean | -0.940989 | 0.773298 | -0.046742 | |
| std | 0.630171 | 0.458935 | 0.968866 | |
| min | -2.971724 | -0.083200 | -3.159611 | |
| 25% | -1.420651 | 0.377107 | -0.658762 | |
| 50% | -0.853645 | 0.778441 | 0.105727 | |
| 75% | -0.364161 | 1.174953 | 0.653944 | |
| max | 0.051742 | 1.872943 | 1.878712 | |
| review_score | count | 32445.000000 | 40858.000000 | 22785.000000 |
| mean | 0.491488 | 0.507486 | -1.609884 | |
| std | 0.328538 | 0.319104 | 0.695249 | |
| min | -0.181821 | -0.244034 | -2.297030 | |
| 25% | -0.057397 | 0.689147 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | -1.550486 | |
| 75% | 0.689147 | 0.689147 | -0.803942 | |
| max | 0.689147 | 0.689147 | -0.223296 |
# Feature distributions within each of the 3 mixture components.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# BIRCH on the Version1 features; inspect the n_clusters = 5 solution.
model_name = "BIRCH_1"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 5
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on all customers, then label the 5% sample for the projections.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
# Per-cluster summary statistics.
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| order_payment_total | count | 7185.000000 | 63227.000000 | 17664.000000 | 1812.000000 | 6200.000000 |
| mean | 1.132835 | -0.087265 | -0.110004 | 2.399645 | -0.810804 | |
| std | 0.947776 | 0.889765 | 0.850422 | 0.599475 | 0.528146 | |
| min | -0.273092 | -2.936358 | -2.653588 | 1.340671 | -5.884510 | |
| 25% | 0.405702 | -0.721395 | -0.702113 | 1.932373 | -1.163604 | |
| 50% | 0.865144 | -0.085565 | -0.109025 | 2.322815 | -0.742100 | |
| 75% | 1.642665 | 0.521129 | 0.444902 | 2.776282 | -0.388604 | |
| max | 6.011733 | 2.705337 | 4.536640 | 5.163559 | 0.103185 | |
| recency | count | 7185.000000 | 63227.000000 | 17664.000000 | 1812.000000 | 6200.000000 |
| mean | 0.364829 | 0.387389 | -1.507425 | -1.008062 | 0.215972 | |
| std | 0.604450 | 0.703892 | 0.468816 | 0.697527 | 0.867032 | |
| min | -1.505915 | -1.437881 | -3.159611 | -2.964532 | -2.287356 | |
| 25% | -0.129486 | -0.194180 | -1.844954 | -1.494082 | -0.274574 | |
| 50% | 0.377161 | 0.418127 | -1.493063 | -0.978587 | 0.331823 | |
| 75% | 0.794226 | 0.985022 | -1.167292 | -0.502848 | 0.886491 | |
| max | 1.787829 | 1.872943 | 0.151303 | 0.679949 | 1.878712 | |
| review_score | count | 7185.000000 | 63227.000000 | 17664.000000 | 1812.000000 | 6200.000000 |
| mean | -1.877036 | 0.400240 | -0.010272 | 0.505807 | -2.024923 | |
| std | 0.741531 | 0.469827 | 0.929022 | 0.356818 | 0.471088 | |
| min | -2.297030 | -1.177214 | -2.297030 | -1.550486 | -2.297030 | |
| 25% | -2.297030 | -0.057397 | -0.803942 | 0.689147 | -2.297030 | |
| 50% | -2.297030 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | |
| 75% | -1.550486 | 0.689147 | 0.689147 | 0.689147 | -1.550486 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.430670 |
# Feature distributions within each of the 5 BIRCH clusters.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# DBSCAN: scan eps over [0.2, 0.5] on the reduced sample and record cluster
# counts, noise counts, and internal validation scores for each value.
model_name = "DBSCAN_1"
eps_list = np.linspace(0.2, 0.5, 20)
e_list = []             # eps values that yielded >= 2 clusters
nclusters_list = []     # number of clusters found (noise excluded)
noisepoints_list = []   # number of noise points (label -1)
silhouette_list = []    # silhouette score per retained eps
c_h_list = []           # Calinski-Harabasz score per retained eps
d_b_list = []           # Davies-Bouldin score per retained eps
X_db=X_reduced
# NOTE: the notebook export stripped the loop-body indentation below;
# the statements down to the last append belong inside the for loop.
for e in eps_list :
# Fit DBSCAN at this eps (default min_samples).
db = DBSCAN(eps=e)
X_labels= db.fit_predict(X_db)
#core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(X_labels)-set({-1}))
n_noise_ = list(labels).count(-1)
print(e, n_clusters_, n_noise_)
# The validation scores are only defined when there are at least 2 clusters.
if n_clusters_ > 1:
e_list.append(e)
nclusters_list.append(n_clusters_)
noisepoints_list.append(n_noise_)
silhouette_list.append(metrics.silhouette_score(X_db, X_labels))
c_h_list.append(metrics.calinski_harabasz_score(X_db, X_labels))
d_b_list.append(metrics.davies_bouldin_score(X_db, X_labels))
0.2 35 548 0.21578947368421053 30 486 0.23157894736842105 29 431 0.2473684210526316 25 373 0.26315789473684215 25 327 0.2789473684210526 24 287 0.2947368421052632 21 273 0.3105263157894737 18 241 0.3263157894736842 15 213 0.34210526315789475 12 200 0.35789473684210527 10 186 0.37368421052631584 6 172 0.3894736842105263 5 126 0.4052631578947369 4 106 0.4210526315789474 4 99 0.4368421052631579 4 90 0.45263157894736844 3 83 0.46842105263157896 2 81 0.4842105263157895 3 69 0.5 3 61
# Plot the DBSCAN eps scan: noise, validation scores, and cluster counts.
# Capture the new figure handle: the original called fig.tight_layout() on a
# stale `fig` left over from an earlier plt.subplots() call, so the layout
# adjustment was applied to the wrong figure.
fig = plt.figure(figsize=(16, 16))
ax1 = plt.subplot(3, 2, 1)
ax2 = plt.subplot(3, 2, 2)
ax3 = plt.subplot(3, 2, 3)
ax4 = plt.subplot(3, 2, 4)
ax5 = plt.subplot(3, 1, 3)
axs = [ax1, ax2, ax3, ax4, ax5]
# The smaller the better.
axs[0].plot(e_list, noisepoints_list)
axs[0].set_title("Number of Noise Points", y=0.9)
# The score is bounded between -1 for incorrect clustering and +1 for highly
# dense clustering. Scores around zero indicate overlapping clusters.
axs[1].plot(e_list, silhouette_list)
axs[1].set_title("Silouhette", y=0.9)
# Zero is the lowest possible score. Values closer to zero indicate a better partition.
axs[2].plot(e_list, d_b_list)
axs[2].set_title("Davies Bouldin", y=0.9)
axs[2].set_xlabel("Eps value")
# The score is higher when clusters are dense and well separated,
# which relates to a standard concept of a cluster.
axs[3].plot(e_list, c_h_list)
axs[3].set_title("Calinski Harabasz", y=0.9)
axs[3].set_xlabel("Eps value")
# How many clusters each eps produced.
axs[4].plot(e_list, nclusters_list)
axs[4].set_title("Number of Clusters", y=0.9)
axs[4].set_xlabel("Eps value")
fig.tight_layout()
# Sweep DBSCAN's min_samples at fixed eps on the full scaled data.
# Fix: np.linspace yields floats, but scikit-learn validates min_samples as an
# integer — truncate before passing it in.
# NOTE(review): the eps sweep above ran on X_reduced while this one runs on
# X_scaled — confirm the change of dataset is intentional.
for ms in np.linspace(10, 60, 10):
    e = 0.4
    db = DBSCAN(eps=e, min_samples=int(ms))
    X_labels = db.fit_predict(X_scaled)
    labels = db.labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(X_labels) - {-1})
    n_noise_ = list(labels).count(-1)
    print(ms, n_clusters_, n_noise_)
10.0 5 128 15.555555555555555 4 216 21.11111111111111 3 341 26.666666666666664 3 438 32.22222222222222 2 590 37.77777777777778 2 702 43.33333333333333 2 863 48.888888888888886 1 975 54.44444444444444 1 1098 60.0 1 1197
# Final DBSCAN run with the hyper-parameters chosen from the two sweeps above.
e, ms = 0.4, 15
db = DBSCAN(eps=e, min_samples=ms)
X_labels = db.fit_predict(X_scaled)
labels = db.labels_
# The noise label (-1) is not a cluster, so it is excluded from the count.
n_clusters_ = len({c for c in X_labels if c != -1})
n_noise_ = int(np.sum(labels == -1))
print(ms, n_clusters_, n_noise_)
15 4 202
# Slice the scaled data per cluster ("Group<label>") and summarise each cluster.
data_dict = {f"Group{c}": X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | -1 | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|---|
| order_payment_total | count | 202.000000 | 95648.000000 | 1.300000e+02 | 7.700000e+01 | 3.100000e+01 |
| mean | 2.101067 | -0.004484 | -1.590189e-02 | 2.282415e-01 | -3.560479e-01 | |
| std | 2.309844 | 0.991150 | 8.752545e-01 | 7.971019e-01 | 4.218048e-01 | |
| min | -5.884510 | -2.716282 | -1.837565e+00 | -1.240714e+00 | -1.108606e+00 | |
| 25% | 0.585196 | -0.700339 | -6.916858e-01 | -2.839134e-01 | -6.674603e-01 | |
| 50% | 2.944570 | -0.050595 | -4.253571e-02 | 1.183128e-01 | -3.948354e-01 | |
| 75% | 3.937432 | 0.587363 | 5.952231e-01 | 7.578185e-01 | 9.964897e-03 | |
| max | 6.011733 | 4.530821 | 1.797086e+00 | 1.775393e+00 | 3.826541e-01 | |
| recency | count | 202.000000 | 95648.000000 | 1.300000e+02 | 7.700000e+01 | 3.100000e+01 |
| mean | -1.403891 | 0.010308 | -2.948446e+00 | -2.956782e+00 | -2.947294e+00 | |
| std | 1.579339 | 0.986625 | 1.333637e-02 | 4.132982e-02 | 1.386490e-02 | |
| min | -2.971724 | -2.360281 | -2.970209e+00 | -3.159611e+00 | -2.966140e+00 | |
| 25% | -2.944094 | -0.697903 | -2.960640e+00 | -2.963856e+00 | -2.959159e+00 | |
| 50% | -1.831560 | 0.130585 | -2.947415e+00 | -2.953064e+00 | -2.951268e+00 | |
| 75% | -0.254846 | 0.813772 | -2.938047e+00 | -2.940530e+00 | -2.932299e+00 | |
| max | 1.609604 | 1.878712 | -2.925777e+00 | -2.850174e+00 | -2.926097e+00 | |
| review_score | count | 202.000000 | 95648.000000 | 1.300000e+02 | 7.700000e+01 | 3.100000e+01 |
| mean | -0.759593 | 0.002535 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 | |
| std | 0.925310 | 0.998345 | 1.114518e-16 | 4.470013e-16 | 1.410719e-17 | |
| min | -2.297030 | -2.297030 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 | |
| 25% | -1.550486 | -0.057397 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 | |
| 50% | -0.803942 | 0.689147 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 | |
| 75% | -0.057397 | 0.689147 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 | |
| max | 0.689147 | 0.689147 | 6.891468e-01 | -2.297030e+00 | -5.739745e-02 |
# Per-cluster feature distributions, drawn with a sequential purple palette.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax, palette='Purples_r')
<AxesSubplot:xlabel='labels', ylabel='value'>
# Transposed view: one group of boxes per feature, clusters as hue.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x="Feature", y="value", hue="labels", data=melted, ax=ax)
<AxesSubplot:xlabel='Feature', ylabel='value'>
# Feature set "Version1b": the three RFM-style features, all standard-scaled.
passthrough_features = []
standardscale_features = ['recency', 'review_score', 'order_payment_total']
log1pscale_features = []
logscale_features = []
# create_preprocessor_X is a project helper defined earlier in the notebook.
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Keep only complete rows for the selected features.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% sample for the algorithms/sweeps that do not scale to the full dataset.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
projection_version = "Version1b"
# Plain boolean reference (was `True & global_do_projections`, which is the
# same value written with a bitwise operator). The four identical
# `if do_projection:` guards are merged into one — nothing changed the flag
# between them, so behavior is unchanged.
do_projection = global_do_projections
if do_projection:
    all_my_projections[projection_version] = {}
    # PCA: linear projection, fitted on the full scaled data.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
    # MDS offers no transform(); only the reduced sample is embedded.
    mds = MDS(n_components=2)
    X_reduced_mds = mds.fit_transform(X_reduced)
    all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
    # Isomap: fit on the sample, then project the full dataset.
    iso = Isomap(n_components=2)
    X_reduced_iso = iso.fit_transform(X_reduced)
    X_iso = iso.transform(X_scaled)
    all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
    # t-SNE on the full data, PCA-initialised.
    tsne = TSNE(n_components=2, init='pca')
    X_tsne0 = tsne.fit_transform(X_scaled)
    all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint: persist the whole notebook session to disk, then reload it.
# NOTE(review): `dill` is not imported in the visible header — confirm it is
# imported earlier in the notebook.
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# KMeans sweep on the Version1b features (reduced sample), then inspect k=4.
# test_model / plot_model_evaluation / create_projections_dict /
# plot_model_2D_projections are project helpers defined earlier in the notebook.
model_name = "KMeans_1b"
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 4
# Pick the swept model with the chosen cluster count...
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# ...refit it on the full scaled data, then label the reduced sample too.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
data = X_scaled.copy()
data['labels'] = X_labels
projection_version = 'Version1b'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Per-cluster slices, plus centroids annotated with each cluster's data share.
data_dict = {f"Group{c}": X_scaled[X_labels == c] for c in set(X_labels)}
centers = pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns)
fractions = pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index()
centers.merge(fractions, left_index=True, right_index=True)
| recency | review_score | order_payment_total | fraction | |
|---|---|---|---|---|
| 0 | -1.006000 | 0.406474 | -0.128960 | 0.339824 |
| 1 | 0.764453 | 0.439676 | -0.130505 | 0.451836 |
| 2 | -0.017815 | -1.811025 | -0.043386 | 0.184955 |
| 3 | -0.012688 | -0.078919 | 4.734463 | 0.023385 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| recency | count | 32653.000000 | 43416.000000 | 17772.000000 | 2247.000000 |
| mean | -1.005919 | 0.764497 | -0.017812 | -0.012732 | |
| std | 0.620210 | 0.473862 | 0.857285 | 1.011390 | |
| min | -2.971724 | -0.125452 | -3.159611 | -2.964532 | |
| 25% | -1.473855 | 0.359132 | -0.417153 | -0.761179 | |
| 50% | -0.941826 | 0.784057 | 0.107493 | 0.093211 | |
| 75% | -0.457667 | 1.179439 | 0.516232 | 0.834927 | |
| max | -0.112982 | 1.872943 | 1.878712 | 1.609604 | |
| review_score | count | 32653.000000 | 43416.000000 | 17772.000000 | 2247.000000 |
| mean | 0.406500 | 0.439664 | -1.810969 | -0.078938 | |
| std | 0.453146 | 0.421807 | 0.627473 | 1.070650 | |
| min | -1.550486 | -0.803942 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -2.297030 | -0.803942 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -1.550486 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.617306 | 0.689147 | |
| order_payment_total | count | 32653.000000 | 43416.000000 | 17772.000000 | 2247.000000 |
| mean | -0.128837 | -0.130542 | -0.043254 | 4.736634 | |
| std | 0.492036 | 0.479945 | 0.570821 | 3.033200 | |
| min | -0.680725 | -0.726021 | -0.726021 | 2.207576 | |
| 25% | -0.452759 | -0.455323 | -0.428919 | 2.971247 | |
| 50% | -0.276207 | -0.266446 | -0.216854 | 3.798253 | |
| 75% | 0.015317 | 0.026180 | 0.128929 | 5.441581 | |
| max | 2.804119 | 2.504048 | 2.825485 | 60.736923 |
# Per-cluster boxplot; y-axis clipped because order_payment_total has extreme outliers.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
plt.ylim(-5, 15)
(-5.0, 15.0)
# Same distributions with features on the x-axis; tighter y-range for readability.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="Feature", y="value", hue="labels", data=melted, ax=ax)
plt.ylim(-5, 5)
(-5.0, 5.0)
# GaussianMixture sweep on the Version1b features (reduced sample).
model_name = "GaussianMixture_6b"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Retrieve the 8-component mixture from the sweep and refit it on the full data.
n_clusters = 8
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = 'Version1b'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Display the current palette as a colour legend for the projections above.
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means side-by-side with the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| recency | review_score | order_payment_total | weights | |
|---|---|---|---|---|
| 0 | -0.100322 | -0.057397 | 1.107870 | 0.027504 |
| 1 | 0.504849 | -0.993526 | -0.131268 | 0.062824 |
| 2 | -0.150783 | -0.088094 | 1.882506 | 0.030979 |
| 3 | -0.040445 | -2.297030 | -0.002565 | 0.109209 |
| 4 | -0.721297 | -0.955393 | -0.363223 | 0.046708 |
| 5 | 0.019031 | -0.728955 | 5.951338 | 0.010268 |
| 6 | 0.025679 | 0.689147 | -0.148409 | 0.549983 |
| 7 | -0.003062 | -0.057397 | -0.263226 | 0.162526 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|---|
| recency | count | 2215.000000 | 6357.000000 | 2941.000000 | 1.049600e+04 | 4414.000000 | 767.000000 | 5.284600e+04 | 1.605200e+04 |
| mean | -0.073185 | 0.565196 | -0.148795 | -4.083867e-02 | -0.862155 | 0.014926 | 2.611297e-02 | -9.373097e-03 | |
| std | 1.017773 | 0.601727 | 1.070314 | 9.560755e-01 | 0.727690 | 1.031764 | 1.011623e+00 | 9.938869e-01 | |
| min | -2.965700 | -1.380490 | -2.966206 | -3.159611e+00 | -2.970562 | -2.959412 | -2.970209e+00 | -2.971724e+00 | |
| 25% | -0.787972 | 0.195943 | -0.976887 | -5.539514e-01 | -1.452621 | -0.664686 | -7.217006e-01 | -7.228546e-01 | |
| 50% | 0.045510 | 0.587208 | -0.076239 | 1.323568e-01 | -0.801326 | 0.182452 | 1.391415e-01 | 1.058383e-01 | |
| 75% | 0.739165 | 1.038341 | 0.782323 | 5.946948e-01 | -0.248372 | 0.845302 | 8.601313e-01 | 7.961197e-01 | |
| max | 1.528483 | 1.773861 | 1.563815 | 1.878712e+00 | 0.380219 | 1.609604 | 1.872943e+00 | 1.611766e+00 | |
| review_score | count | 2215.000000 | 6357.000000 | 2941.000000 | 1.049600e+04 | 4414.000000 | 767.000000 | 5.284600e+04 | 1.605200e+04 |
| mean | -0.057397 | -1.009320 | 0.005257 | -2.297030e+00 | -0.951903 | -0.857962 | 6.891468e-01 | -5.739745e-02 | |
| std | 0.000000 | 0.363563 | 0.814933 | 8.882207e-16 | 0.308113 | 1.301531 | 1.110234e-16 | 2.081733e-17 | |
| min | -0.057397 | -2.110394 | -2.110394 | -2.297030e+00 | -2.110394 | -2.297030 | 6.891468e-01 | -5.739745e-02 | |
| 25% | -0.057397 | -1.550486 | -0.803942 | -2.297030e+00 | -0.803942 | -2.297030 | 6.891468e-01 | -5.739745e-02 | |
| 50% | -0.057397 | -0.803942 | 0.689147 | -2.297030e+00 | -0.803942 | -0.803942 | 6.891468e-01 | -5.739745e-02 | |
| 75% | -0.057397 | -0.803942 | 0.689147 | -2.297030e+00 | -0.803942 | 0.689147 | 6.891468e-01 | -5.739745e-02 | |
| max | -0.057397 | 0.315875 | 0.689147 | -2.297030e+00 | 0.129239 | 0.689147 | 6.891468e-01 | -5.739745e-02 | |
| order_payment_total | count | 2215.000000 | 6357.000000 | 2941.000000 | 1.049600e+04 | 4414.000000 | 767.000000 | 5.284600e+04 | 1.605200e+04 |
| mean | 1.377704 | -0.122160 | 2.036620 | -2.276110e-03 | -0.373722 | 7.148938 | -1.487427e-01 | -2.625231e-01 | |
| std | 1.115093 | 0.324543 | 1.417000 | 6.283780e-01 | 0.173693 | 4.066855 | 4.389520e-01 | 2.551355e-01 | |
| min | 0.370445 | -0.726021 | -0.637903 | -7.260214e-01 | -0.664037 | 2.417504 | -6.828842e-01 | -6.707843e-01 | |
| 25% | 0.635161 | -0.396443 | 0.907703 | -4.173928e-01 | -0.512326 | 4.569796 | -4.567178e-01 | -4.696724e-01 | |
| 50% | 0.936986 | -0.160425 | 2.086126 | -1.996266e-01 | -0.406856 | 6.590878 | -2.747230e-01 | -3.158360e-01 | |
| 75% | 1.742783 | 0.095339 | 2.932451 | 1.627886e-01 | -0.265119 | 8.410960 | 1.418126e-02 | -9.397652e-02 | |
| max | 6.249154 | 0.793585 | 5.677306 | 3.111882e+00 | 0.202889 | 60.736923 | 1.940836e+00 | 4.426403e-01 |
# Per-cluster boxplot for the 8-component mixture; y-axis clipped for outliers.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
plt.ylim(-5, 15)
(-5.0, 15.0)
# Re-select a 4-component mixture from the same sweep (model_name is still
# "GaussianMixture_6b" from the cell above) and refit it on the full data.
n_clusters = 4
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = 'Version1b'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Display the current palette as a colour legend for the projections above.
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means side-by-side with the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| recency | review_score | order_payment_total | weights | |
|---|---|---|---|---|
| 0 | -0.836031 | 0.525753 | -0.232373 | 0.297304 |
| 1 | 0.756941 | 0.543516 | -0.212401 | 0.352943 |
| 2 | -0.061249 | -0.231436 | 1.797299 | 0.110215 |
| 3 | -0.049476 | -1.346896 | -0.225597 | 0.239537 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| recency | count | 29991.000000 | 37017.000000 | 9208.000000 | 19872.000000 |
| mean | -0.920728 | 0.786733 | -0.069778 | -0.043599 | |
| std | 0.635717 | 0.452467 | 1.014689 | 0.964666 | |
| min | -2.971724 | -0.161906 | -2.966621 | -3.159611 | |
| 25% | -1.406075 | 0.399004 | -0.768236 | -0.650466 | |
| 50% | -0.834240 | 0.789164 | 0.041220 | 0.109862 | |
| 75% | -0.337822 | 1.182393 | 0.752978 | 0.651811 | |
| max | 0.080439 | 1.872943 | 1.655003 | 1.878712 | |
| review_score | count | 29991.000000 | 37017.000000 | 9208.000000 | 19872.000000 |
| mean | 0.490414 | 0.515565 | -0.209070 | -1.603640 | |
| std | 0.329102 | 0.314039 | 1.094379 | 0.695191 | |
| min | -0.181821 | -0.244034 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | 0.689147 | -0.803942 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | -0.057397 | -1.550486 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | -0.803942 | |
| max | 0.689147 | 0.689147 | 0.689147 | -0.206706 | |
| order_payment_total | count | 29991.000000 | 37017.000000 | 9208.000000 | 19872.000000 |
| mean | -0.237931 | -0.214334 | 2.099749 | -0.214608 | |
| std | 0.293887 | 0.316192 | 2.165001 | 0.295105 | |
| min | -0.680725 | -0.682884 | 0.397254 | -0.726021 | |
| 25% | -0.466389 | -0.464545 | 0.929508 | -0.451146 | |
| 50% | -0.312372 | -0.287408 | 1.367390 | -0.278771 | |
| 75% | -0.069203 | -0.028675 | 2.426849 | -0.025976 | |
| max | 0.775683 | 0.878195 | 60.736923 | 0.690038 |
# Per-cluster boxplot for the 4-component mixture; y-axis clipped for outliers.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
plt.ylim(-5, 15)
(-5.0, 15.0)
# BIRCH sweep on the Version1b features, then inspect the 5-cluster model.
model_name = "BIRCH_1b"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 5
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on the full scaled data, then label the reduced sample with the same model.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projection_version = 'Version1b'
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Display the current palette as a colour legend for the projections above.
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| recency | count | 323.000000 | 92480.000000 | 3279.000000 | 1.000000 | 5.000000 |
| mean | -0.155455 | 0.015767 | -0.428812 | -0.618709 | -0.244189 | |
| std | 1.142938 | 0.993466 | 1.069400 | NaN | 1.557413 | |
| min | -2.222140 | -3.159611 | -2.964532 | -0.618709 | -2.110190 | |
| 25% | -1.258269 | -0.683953 | -1.381155 | -0.618709 | -1.452399 | |
| 50% | -0.200658 | 0.138654 | -0.385719 | -0.618709 | -0.254817 | |
| 75% | 0.867747 | 0.816473 | 0.454014 | -0.618709 | 1.265180 | |
| max | 1.609604 | 1.878712 | 1.537035 | -0.618709 | 1.331280 | |
| review_score | count | 323.000000 | 92480.000000 | 3279.000000 | 1.000000 | 5.000000 |
| mean | -0.131358 | 0.001298 | -0.021728 | -2.297030 | -0.803942 | |
| std | 1.146040 | 0.998221 | 1.031839 | NaN | 1.493088 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.803942 | -0.057397 | -0.803942 | -2.297030 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | -2.297030 | -0.803942 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | -2.297030 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | -2.297030 | 0.689147 | |
| order_payment_total | count | 323.000000 | 92480.000000 | 3279.000000 | 1.000000 | 5.000000 |
| mean | 9.544406 | -0.140666 | 2.963149 | 60.736923 | 29.802495 | |
| std | 2.825472 | 0.449230 | 1.425874 | NaN | 1.983618 | |
| min | 5.935859 | -0.726021 | 0.700564 | 60.736923 | 26.629597 | |
| 25% | 7.829823 | -0.451410 | 2.014560 | 60.736923 | 29.531437 | |
| 50% | 8.640725 | -0.267076 | 2.691890 | 60.736923 | 30.411048 | |
| 75% | 10.372014 | 0.022199 | 3.732288 | 60.736923 | 30.442985 | |
| max | 20.907513 | 2.511470 | 7.814192 | 60.736923 | 31.997407 |
# Per-cluster boxplot for the 5-cluster BIRCH model; y-axis clipped for outliers.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
plt.ylim(-5,15)
(-5.0, 15.0)
# Feature set "Version2": adds zip-code density (log-scaled) and moves the
# payment total to log1p scaling to tame its heavy tail.
passthrough_features = []
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total']
logscale_features = ['customer_zip_code_density_3digits']
# create_preprocessor_X is a project helper defined earlier in the notebook.
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% sample for the algorithms/sweeps that do not scale to the full dataset.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# 2D projections of the Version2 feature space, cached in all_my_projections.
# NOTE(review): unlike Version1b these are not guarded by global_do_projections
# — confirm that is intentional.
projection_version = "Version2"
all_my_projections[projection_version] = {}
# PCA: linear projection, fitted on the full scaled data.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS offers no transform(); only the reduced sample is embedded.
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap: fit on the sample, then project the full dataset.
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full data, PCA-initialised.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint: persist the whole notebook session to disk, then reload it.
# NOTE(review): `dill` is not imported in the visible header — confirm it is
# imported earlier in the notebook.
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# KMeans sweep on the Version2 features, then inspect the 5-cluster model.
model_name = "KMeans_2"
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
# (Removed a redundant `model_dict = all_my_models["KMeans_2"]` re-read: it
#  returned the exact dict just stored on the previous line.)
plot_model_evaluation(model_dict, figsize=(8,6))
n_clusters = 5
# Pick the swept model with the chosen cluster count, refit on the full data,
# then label the reduced sample with the same fitted model.
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = "Version2"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centroids side-by-side with each cluster's share of the data.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
    pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
    right_index=True, left_index=True)
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | fraction | |
|---|---|---|---|---|---|
| 0 | 0.204755 | -0.234491 | -1.292046 | 0.355828 | 0.211691 |
| 1 | 0.558633 | -0.668789 | 0.645685 | 0.388976 | 0.275685 |
| 2 | -1.380094 | -0.032353 | 0.177535 | 0.326415 | 0.181979 |
| 3 | 0.283043 | 1.205405 | 0.375013 | 0.367372 | 0.181063 |
| 4 | 0.017253 | 0.147765 | -0.032625 | -2.061121 | 0.149582 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 20341.000000 | 26490.000000 | 17486.000000 | 17398.000000 | 14373.000000 |
| mean | 0.201597 | 0.559693 | -1.379230 | 0.283884 | 0.017481 | |
| std | 0.777712 | 0.630849 | 0.648630 | 0.725778 | 0.948472 | |
| min | -2.692599 | -1.105850 | -5.508351 | -2.924617 | -3.346208 | |
| 25% | -0.312395 | 0.146006 | -1.726095 | -0.231333 | -0.592951 | |
| 50% | 0.228536 | 0.533904 | -1.286421 | 0.273484 | 0.146006 | |
| 75% | 0.695075 | 1.008102 | -0.913769 | 0.729169 | 0.657063 | |
| max | 2.029172 | 2.029172 | -0.224855 | 2.029172 | 2.029172 | |
| order_payment_total | count | 20341.000000 | 26490.000000 | 17486.000000 | 17398.000000 | 14373.000000 |
| mean | -0.232703 | -0.670178 | -0.033243 | 1.203687 | 0.147914 | |
| std | 0.787353 | 0.615921 | 0.770929 | 0.772290 | 0.991251 | |
| min | -2.597484 | -5.884510 | -2.653588 | 0.120970 | -2.519897 | |
| 25% | -0.788472 | -1.126694 | -0.584355 | 0.613595 | -0.539397 | |
| 50% | -0.233294 | -0.625599 | -0.018972 | 0.979061 | 0.103081 | |
| 75% | 0.300808 | -0.163961 | 0.473543 | 1.584011 | 0.711543 | |
| max | 3.187162 | 0.624266 | 3.371884 | 6.011733 | 4.407333 | |
| recency | count | 20341.000000 | 26490.000000 | 17486.000000 | 17398.000000 | 14373.000000 |
| mean | -1.293644 | 0.643851 | 0.181155 | 0.376393 | -0.031849 | |
| std | 0.550597 | 0.569692 | 0.808881 | 0.745993 | 0.918605 | |
| min | -3.102527 | -0.894574 | -2.970152 | -2.241105 | -3.159611 | |
| 25% | -1.708398 | 0.178062 | -0.325223 | -0.174311 | -0.521888 | |
| 50% | -1.271299 | 0.685998 | 0.209206 | 0.451697 | 0.104364 | |
| 75% | -0.861332 | 1.136172 | 0.815512 | 0.983273 | 0.582001 | |
| max | -0.124464 | 1.611766 | 1.872943 | 1.609604 | 1.878712 | |
| review_score | count | 20341.000000 | 26490.000000 | 17486.000000 | 17398.000000 | 14373.000000 |
| mean | 0.355149 | 0.389054 | 0.326247 | 0.367552 | -2.061473 | |
| std | 0.518866 | 0.486002 | 0.543205 | 0.514289 | 0.435628 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -0.057397 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -1.550486 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.803942 |
# Per-cluster boxplot for the 5-cluster KMeans on the Version2 features.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Transposed view: features on the x-axis, clusters as hue.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x="Feature", y="value", hue="labels", data=melted, ax=ax)
<AxesSubplot:xlabel='Feature', ylabel='value'>
X_scaled
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | |
|---|---|---|---|---|
| 0 | -0.138157 | 0.357231 | -1.505408 | -0.057397 |
| 1 | 0.620478 | 1.384383 | 0.067271 | 0.689147 |
| 2 | -0.340884 | 0.445768 | 0.893939 | 0.689147 |
| 3 | 0.492626 | 0.562666 | 0.457148 | 0.689147 |
| 4 | 2.029172 | 1.029390 | 1.355099 | 0.689147 |
| ... | ... | ... | ... | ... |
| 96084 | 0.232703 | -0.266116 | 0.620044 | -0.057397 |
| 96085 | 1.125515 | 0.196904 | 0.598459 | 0.689147 |
| 96086 | -0.193055 | -0.832794 | 0.627757 | -2.297030 |
| 96087 | -2.081436 | 2.320878 | -0.388981 | 0.689147 |
| 96088 | 1.125515 | -1.980004 | -0.090914 | 0.689147 |
96088 rows × 4 columns
# Scatter of two scaled features coloured by KMeans label, with the matching
# cluster centres overlaid (columns 0 and 1 of cluster_centers_ correspond to
# zip-code density and payment total in this Version2 column order).
fig, ax = plt.subplots(figsize = (6,6))
plt.scatter(X_scaled['customer_zip_code_density_3digits'], X_scaled['order_payment_total'], c=X_labels,
cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=100, c=sns.color_palette('tab10')[:5], edgecolors='black')
plt.xlabel('customer_zip_code_density_3digits')
plt.ylabel('order_payment_total')
Text(0, 0.5, 'order_payment_total')
# Same scatter for zip-code density vs recency; centres use centroid columns
# 0 (density) and 2 (recency) of the Version2 column order.
fig, ax = plt.subplots(figsize = (6,6))
plt.scatter(X_scaled['customer_zip_code_density_3digits'], X_scaled['recency'], c=X_labels,
cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 2], s=100, c=sns.color_palette('tab10')[:5], edgecolors='black')
plt.xlabel('customer_zip_code_density_3digits')
plt.ylabel('recency')
Text(0, 0.5, 'recency')
# Same scatter for recency vs review score; centres use centroid columns
# 2 (recency) and 3 (review score) of the Version2 column order.
fig, ax = plt.subplots(figsize = (5,6))
plt.scatter(X_scaled['recency'], X_scaled['review_score'], c=X_labels,
cmap=ListedColormap(sns.color_palette('tab10'), N=len(set(X_labels))), s=1)
plt.scatter(km.cluster_centers_[:, 2], km.cluster_centers_[:, 3], s=100, c=sns.color_palette('tab10')[:5], edgecolors='black')
plt.xlabel('recency')
plt.ylabel('review_score')
Text(0, 0.5, 'review_score')
# GaussianMixture sweep on the Version2 features, then inspect 3 components.
model_name = "GaussianMixture_2"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 3
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Use more initialisations for the refit on the full data.
gm.set_params(n_init=10)
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version2"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Display the current palette as a colour legend for the projections above.
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means side-by-side with the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | weights | |
|---|---|---|---|---|---|
| 0 | -0.094469 | -0.004275 | -0.890011 | 0.507531 | 0.315101 |
| 1 | -0.016072 | 0.067368 | -0.049291 | -1.399894 | 0.271958 |
| 2 | 0.082671 | -0.041105 | 0.711600 | 0.534673 | 0.412941 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | |
|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 31378.000000 | 22787.000000 | 41923.000000 |
| mean | -0.089918 | -0.013755 | 0.074777 | |
| std | 1.034193 | 1.006067 | 0.964184 | |
| min | -5.508351 | -5.508351 | -5.508351 | |
| 25% | -0.773362 | -0.659443 | -0.496363 | |
| 50% | 0.031420 | 0.141475 | 0.220151 | |
| 75% | 0.657063 | 0.665326 | 0.705685 | |
| max | 2.029172 | 2.029172 | 2.029172 | |
| order_payment_total | count | 31378.000000 | 22787.000000 | 41923.000000 |
| mean | -0.006139 | 0.090010 | -0.044329 | |
| std | 0.997232 | 1.029161 | 0.982675 | |
| min | -2.880980 | -5.884510 | -2.936358 | |
| 25% | -0.708094 | -0.637156 | -0.740065 | |
| 50% | -0.067156 | 0.034444 | -0.079918 | |
| 75% | 0.580245 | 0.694794 | 0.549263 | |
| max | 5.163559 | 6.011733 | 4.673832 | |
| recency | count | 31378.000000 | 22787.000000 | 41923.000000 |
| mean | -0.966113 | -0.046685 | 0.748480 | |
| std | 0.626211 | 0.968844 | 0.477844 | |
| min | -2.971724 | -3.159611 | -0.254655 | |
| 25% | -1.444874 | -0.658636 | 0.341197 | |
| 50% | -0.890693 | 0.105763 | 0.758173 | |
| 75% | -0.415815 | 0.653946 | 1.160143 | |
| max | 0.918898 | 1.878712 | 1.872943 | |
| review_score | count | 31378.000000 | 22787.000000 | 41923.000000 |
| mean | 0.482203 | -1.609762 | 0.514064 | |
| std | 0.333326 | 0.695340 | 0.315067 | |
| min | -0.181821 | -2.297030 | -0.244034 | |
| 25% | -0.057397 | -2.297030 | 0.689147 | |
| 50% | 0.689147 | -1.550486 | 0.689147 | |
| 75% | 0.689147 | -0.803942 | 0.689147 | |
| max | 0.689147 | -0.206706 | 0.689147 |
# Per-cluster boxplot for the 3-component mixture on the Version2 features.
melted = pd.melt(data, id_vars=['labels'], var_name=['Feature'])
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="labels", y="value", hue="Feature", data=melted, ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Same mixture family with 5 components (model_name is still
# "GaussianMixture_2" from the previous cell); refit on the full data.
n_clusters = 5
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Use more initialisations for the refit on the full data.
gm.set_params(n_init=10)
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version2"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# Display the current palette as a colour legend for the projections above.
sns.color_palette()
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means side-by-side with the mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | weights | |
|---|---|---|---|---|---|
| 0 | -0.150620 | -0.103477 | -0.186158 | -0.998245 | 0.100408 |
| 1 | 0.450515 | 0.393525 | 0.462170 | -0.787710 | 0.028548 |
| 2 | 0.012722 | -0.027255 | 0.023421 | 0.689147 | 0.568010 |
| 3 | -0.022568 | -0.030511 | -0.016949 | -0.057397 | 0.190815 |
| 4 | -0.005858 | 0.182312 | -0.040738 | -2.297030 | 0.112219 |
# Cluster-wise descriptive statistics of every scaled feature.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 10894.000000 | 1494.000000 | 5.457900e+04 | 1.833800e+04 | 1.078300e+04 |
| mean | -0.093846 | 0.538481 | 1.272177e-02 | -2.253935e-02 | -5.856534e-03 | |
| std | 1.014248 | 0.676907 | 9.940009e-01 | 1.010275e+00 | 1.010505e+00 | |
| min | -5.508351 | -1.904780 | -5.508351e+00 | -5.508351e+00 | -4.366046e+00 | |
| 25% | -0.741035 | 0.163933 | -6.304240e-01 | -6.892947e-01 | -6.545612e-01 | |
| 50% | 0.016166 | 0.608956 | 1.727823e-01 | 1.277636e-01 | 1.505166e-01 | |
| 75% | 0.620478 | 1.050626 | 6.897289e-01 | 6.653264e-01 | 6.598249e-01 | |
| max | 2.029172 | 2.029172 | 2.029172e+00 | 2.029172e+00 | 2.029172e+00 | |
| order_payment_total | count | 10894.000000 | 1494.000000 | 5.457900e+04 | 1.833800e+04 | 1.078300e+04 |
| mean | -0.136235 | 1.047538 | -2.725516e-02 | -3.049465e-02 | 1.823137e-01 | |
| std | 0.865338 | 1.171040 | 9.928136e-01 | 9.806854e-01 | 1.063480e+00 | |
| min | -2.519897 | -5.884510 | -2.936358e+00 | -2.653588e+00 | -5.884510e+00 | |
| 25% | -0.759688 | 0.420948 | -7.264135e-01 | -7.089864e-01 | -5.653810e-01 | |
| 50% | -0.145758 | 1.067581 | -7.477367e-02 | -7.842029e-02 | 1.113377e-01 | |
| 75% | 0.441541 | 1.769593 | 5.637407e-01 | 5.544324e-01 | 7.926716e-01 | |
| max | 3.651798 | 5.162278 | 5.163559e+00 | 4.695665e+00 | 6.011733e+00 | |
| recency | count | 10894.000000 | 1494.000000 | 5.457900e+04 | 1.833800e+04 | 1.078300e+04 |
| mean | -0.126794 | 0.570435 | 2.342180e-02 | -1.690554e-02 | -4.073676e-02 | |
| std | 0.983550 | 0.673790 | 1.013262e+00 | 9.976743e-01 | 9.568302e-01 | |
| min | -2.970562 | -1.995107 | -2.970209e+00 | -2.971724e+00 | -3.159611e+00 | |
| 25% | -0.845679 | 0.150351 | -7.241899e-01 | -7.299511e-01 | -5.591898e-01 | |
| 50% | 0.001270 | 0.673056 | 1.378200e-01 | 9.797233e-02 | 1.323712e-01 | |
| 75% | 0.617772 | 1.109744 | 8.599007e-01 | 7.904065e-01 | 5.986851e-01 | |
| max | 1.773861 | 1.609604 | 1.872943e+00 | 1.611766e+00 | 1.878712e+00 | |
| review_score | count | 10894.000000 | 1494.000000 | 5.457900e+04 | 1.833800e+04 | 1.078300e+04 |
| mean | -0.995306 | -0.634971 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 | |
| std | 0.346558 | 0.584052 | 1.110233e-16 | 2.775633e-17 | 8.882196e-16 | |
| min | -2.110394 | -2.110394 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 | |
| 25% | -1.550486 | -0.803942 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 | |
| 50% | -0.803942 | -0.803942 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 | |
| 75% | -0.803942 | -0.430670 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 | |
| max | 0.502511 | 0.564723 | 6.891468e-01 | -5.739745e-02 | -2.297030e+00 |
fig, ax = plt.subplots(figsize = (16,6))
# Melt to long format so each (cluster, feature) pair becomes one box.
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Sweep BIRCH over candidate cluster counts on the 5% sample, then keep the run.
model_name = "BIRCH_2"
model_dict = test_model(X_reduced, "Birch")  # helper defined earlier in the notebook
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Refit the retained 5-cluster BIRCH model on the full scaled dataset.
n_clusters = 5
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = bm.fit_predict(X_scaled)  # refit on the full data (the sweep above used the sample)
X_labels_reduced = bm.predict(X_reduced.astype(float))  # labels for the sampled points, for the 2D plots
projection_version = "Version2"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displays the palette so plot colours can be matched to groups
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Per-cluster summary statistics of the scaled features.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 16243.000000 | 3108.000000 | 45624.000000 | 18044.000000 | 13069.000000 |
| mean | 0.064768 | -1.695860 | 0.368131 | -1.135660 | 0.605629 | |
| std | 0.771470 | 0.801605 | 0.756866 | 0.745165 | 0.619563 | |
| min | -3.015089 | -5.508351 | -2.161458 | -4.787637 | -2.007135 | |
| 25% | -0.400315 | -2.161458 | -0.174432 | -1.622031 | 0.203174 | |
| 50% | 0.127764 | -1.573661 | 0.372585 | -1.136434 | 0.549354 | |
| 75% | 0.549354 | -1.121030 | 0.855356 | -0.620928 | 1.008102 | |
| max | 2.029172 | -0.231333 | 2.029172 | 0.936647 | 2.029172 | |
| order_payment_total | count | 16243.000000 | 3108.000000 | 45624.000000 | 18044.000000 | 13069.000000 |
| mean | 1.289402 | 1.195830 | -0.280677 | -0.359152 | -0.411222 | |
| std | 0.839125 | 0.773534 | 0.806375 | 0.692152 | 0.612429 | |
| min | -0.488164 | -1.303596 | -5.884510 | -2.653588 | -2.597484 | |
| 25% | 0.689334 | 0.701258 | -0.886495 | -0.855781 | -0.788472 | |
| 50% | 1.170483 | 1.088732 | -0.250490 | -0.397615 | -0.362405 | |
| 75% | 1.769597 | 1.598632 | 0.339791 | 0.158771 | 0.051717 | |
| max | 6.011733 | 4.536640 | 2.561600 | 1.763554 | 2.006690 | |
| recency | count | 16243.000000 | 3108.000000 | 45624.000000 | 18044.000000 | 13069.000000 |
| mean | -0.364134 | 0.419687 | 0.544575 | -0.263390 | -1.184697 | |
| std | 0.978071 | 0.907749 | 0.617937 | 1.042716 | 0.626325 | |
| min | -2.966621 | -2.967019 | -1.466894 | -2.971724 | -3.159611 | |
| 25% | -1.047042 | 0.116779 | 0.098804 | -1.127740 | -1.617628 | |
| 50% | -0.430522 | 0.598938 | 0.582020 | -0.182538 | -1.186118 | |
| 75% | 0.288688 | 1.084857 | 1.051759 | 0.546822 | -0.727480 | |
| max | 1.609604 | 1.872943 | 1.878712 | 1.773861 | 0.431974 | |
| review_score | count | 16243.000000 | 3108.000000 | 45624.000000 | 18044.000000 | 13069.000000 |
| mean | 0.042104 | -0.040984 | 0.069488 | -0.210734 | 0.005787 | |
| std | 1.037100 | 1.017478 | 0.945543 | 1.143265 | 0.873608 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -0.057397 | -0.803942 | -0.057397 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.057397 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Feature set "2b": four customer features, all standard-scaled.
passthrough_features = []
standardscale_features = [
    'recency',
    'review_score',
    'order_payment_total',
    'customer_zip_code_density_3digits',
]
log1pscale_features = []
logscale_features = []

preprocessor_X = create_preprocessor_X(
    logscale_features=logscale_features,
    standardscale_features=standardscale_features,
    log1pscale_features=log1pscale_features,
)

# Column order mirrors the transformer order: log, log1p, standard, passthrough.
X = customers_df[
    logscale_features + log1pscale_features + standardscale_features + passthrough_features
].dropna().copy()
X_scaled = pd.DataFrame(
    preprocessor_X.fit_transform(X), index=X.index, columns=X.columns
).astype(float)
# Reproducible 5% sample to keep the slower model sweeps tractable.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# Build 2D projections of the "Version2b" feature space for cluster visualisation.
projection_version = "Version2b"
all_my_projections[projection_version] = {}
# PCA is cheap, so it is fitted on the full scaled dataset.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS is fitted on the 5% sample only (no transform for new points is used here).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap is fitted on the sample, then used to project the full dataset.
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset, PCA-initialised for more stable embeddings.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint the whole notebook session (dill is presumably imported elsewhere
# in the notebook — it is not in the visible import block; confirm).
dill.dump_session('notebook_env.db')
# NOTE(review): loading right after dumping looks redundant — presumably kept so
# this cell can be re-run alone to restore state; confirm before removing.
dill.load_session('notebook_env.db')
# Sweep KMeans over candidate cluster counts on the sampled data.
model_name = "KMeans_2b"
model_dict = test_model(X_reduced)  # NOTE(review): no model name passed — presumably defaults to KMeans; confirm test_model's signature
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Refit the retained 5-cluster KMeans on the full scaled dataset.
n_clusters = 5
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = "Version2b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centres (scaled units) side by side with each cluster's share of customers.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| recency | review_score | order_payment_total | customer_zip_code_density_3digits | fraction | |
|---|---|---|---|---|---|
| 0 | 0.764790 | 0.440775 | -0.126255 | -0.184346 | 0.420979 |
| 1 | 0.025527 | 0.039063 | -0.121612 | 2.787751 | 0.069270 |
| 2 | -1.016985 | 0.408490 | -0.122578 | -0.242411 | 0.315024 |
| 3 | -0.020758 | -1.820733 | -0.038344 | -0.206399 | 0.172821 |
| 4 | -0.005022 | -0.092854 | 4.882530 | -0.140147 | 0.021907 |
# Attach cluster labels to the scaled features and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| recency | count | 40451.000000 | 6656.000000 | 30270.000000 | 16606.000000 | 2105.000000 |
| mean | 0.765236 | 0.025724 | -1.016394 | -0.020882 | -0.006074 | |
| std | 0.474723 | 0.942773 | 0.619070 | 0.861743 | 1.012207 | |
| min | -0.174867 | -2.970615 | -2.971724 | -3.159611 | -2.964532 | |
| 25% | 0.360075 | -0.571545 | -1.485989 | -0.422880 | -0.761123 | |
| 50% | 0.784477 | 0.092881 | -0.952012 | 0.107057 | 0.104212 | |
| 75% | 1.181060 | 0.781380 | -0.474274 | 0.516992 | 0.836148 | |
| max | 1.872943 | 1.637597 | -0.095046 | 1.878712 | 1.609604 | |
| review_score | count | 40451.000000 | 6656.000000 | 30270.000000 | 16606.000000 | 2105.000000 |
| mean | 0.440616 | 0.038313 | 0.408484 | -1.821590 | -0.092094 | |
| std | 0.422050 | 0.928230 | 0.453003 | 0.623092 | 1.080610 | |
| min | -0.803942 | -2.297030 | -1.550486 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -0.057397 | -2.297030 | -0.803942 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | -2.297030 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | -1.550486 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | -0.617306 | 0.689147 | |
| order_payment_total | count | 40451.000000 | 6656.000000 | 30270.000000 | 16606.000000 | 2105.000000 |
| mean | -0.126237 | -0.121501 | -0.122874 | -0.038297 | 4.879087 | |
| std | 0.485192 | 0.551599 | 0.503139 | 0.579245 | 3.080974 | |
| min | -0.726021 | -0.664037 | -0.680725 | -0.726021 | 2.298573 | |
| 25% | -0.451590 | -0.466940 | -0.451635 | -0.428694 | 3.062042 | |
| 50% | -0.263028 | -0.290242 | -0.272406 | -0.215033 | 3.914665 | |
| 75% | 0.029306 | 0.016458 | 0.019365 | 0.133067 | 5.612398 | |
| max | 2.557981 | 3.582297 | 2.804119 | 2.890123 | 60.736923 | |
| customer_zip_code_density_3digits | count | 40451.000000 | 6656.000000 | 30270.000000 | 16606.000000 | 2105.000000 |
| mean | -0.184570 | 2.784869 | -0.242411 | -0.206895 | -0.140900 | |
| std | 0.617990 | 0.910270 | 0.608249 | 0.641083 | 0.864506 | |
| min | -1.087434 | 1.146372 | -1.087434 | -1.080122 | -1.087434 | |
| 25% | -0.703556 | 1.928753 | -0.740116 | -0.718180 | -0.765708 | |
| 50% | -0.279462 | 3.230284 | -0.359893 | -0.337957 | -0.367205 | |
| 75% | 0.236032 | 3.244908 | 0.133665 | 0.159257 | 0.254312 | |
| max | 1.515627 | 4.052880 | 1.928753 | 1.928753 | 4.052880 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
# Clip the axis: extreme payment outliers would otherwise flatten the boxes.
plt.ylim(-5,15)
(-5.0, 15.0)
# Sweep Gaussian mixtures over candidate component counts on the sampled data.
model_name = "GaussianMixture_2b"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Refit the retained 3-component mixture on the full scaled dataset.
n_clusters = 3
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
gm.set_params(n_init=10)  # more EM restarts for the final fit than during the sweep
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version2b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displays the palette so plot colours can be matched to groups
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means (scaled units) side by side with each component's mixture weight.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| recency | review_score | order_payment_total | customer_zip_code_density_3digits | weights | |
|---|---|---|---|---|---|
| 0 | -0.050573 | -0.702435 | 1.512104 | 0.822677 | 0.104466 |
| 1 | 0.025698 | 0.689147 | -0.148396 | 0.004085 | 0.549719 |
| 2 | -0.025574 | -0.883295 | -0.220890 | -0.255013 | 0.345815 |
# Attach cluster labels to the scaled features and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | |
|---|---|---|---|---|
| recency | count | 9014.000000 | 5.286000e+04 | 34214.000000 |
| mean | -0.052917 | 2.613021e-02 | -0.026429 | |
| std | 1.003911 | 1.011545e+00 | 0.979618 | |
| min | -3.102527 | -2.970209e+00 | -3.159611 | |
| 25% | -0.742395 | -7.213382e-01 | -0.688721 | |
| 50% | 0.073666 | 1.391873e-01 | 0.110702 | |
| 75% | 0.747642 | 8.600479e-01 | 0.729263 | |
| max | 1.655003 | 1.872943e+00 | 1.878712 | |
| review_score | count | 9014.000000 | 5.286000e+04 | 34214.000000 |
| mean | -0.677375 | 6.891468e-01 | -0.886258 | |
| std | 1.092326 | 1.110234e-16 | 0.924154 | |
| min | -2.297030 | 6.891468e-01 | -2.297030 | |
| 25% | -1.550486 | 6.891468e-01 | -1.923758 | |
| 50% | -0.057397 | 6.891468e-01 | -0.803942 | |
| 75% | -0.057397 | 6.891468e-01 | -0.057397 | |
| max | 0.689147 | 6.891468e-01 | 0.564723 | |
| order_payment_total | count | 9014.000000 | 5.286000e+04 | 34214.000000 |
| mean | 1.701787 | -1.479643e-01 | -0.219750 | |
| std | 2.443833 | 4.411754e-01 | 0.298893 | |
| min | -0.664037 | -6.828842e-01 | -0.726021 | |
| 25% | 0.071555 | -4.566503e-01 | -0.455683 | |
| 50% | 1.070671 | -2.746780e-01 | -0.287442 | |
| 75% | 2.468681 | 1.454111e-02 | -0.039065 | |
| max | 60.736923 | 2.009747e+00 | 0.750043 | |
| customer_zip_code_density_3digits | count | 9014.000000 | 5.286000e+04 | 34214.000000 |
| mean | 0.879521 | 4.453043e-03 | -0.238598 | |
| std | 1.624180 | 9.837500e-01 | 0.598016 | |
| min | -1.087434 | -1.087434e+00 | -1.087434 | |
| 25% | -0.542692 | -6.889319e-01 | -0.725492 | |
| 50% | 0.298184 | -2.282778e-01 | -0.348925 | |
| 75% | 1.928753 | 3.274318e-01 | 0.133665 | |
| max | 4.052880 | 4.052880e+00 | 1.515627 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Re-use the GaussianMixture_2b sweep, this time keeping the 8-component model.
n_clusters = 8
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
gm.set_params(n_init=10)  # more EM restarts for the final fit than during the sweep
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version2b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displays the palette so plot colours can be matched to groups
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means (scaled units) side by side with each component's mixture weight.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| recency | review_score | order_payment_total | customer_zip_code_density_3digits | weights | |
|---|---|---|---|---|---|
| 0 | 0.025460 | 0.689147 | -0.140827 | 0.006653 | 0.551918 |
| 1 | -0.040335 | -2.297030 | -0.001021 | 0.011410 | 0.109259 |
| 2 | -0.046821 | -1.550486 | -0.112840 | -0.141790 | 0.029210 |
| 3 | -0.007340 | -0.057397 | -0.264416 | -0.281984 | 0.146454 |
| 4 | -0.126966 | -0.987711 | -0.210333 | 3.586676 | 0.004068 |
| 5 | 0.007627 | -0.043443 | 3.451435 | -0.059905 | 0.032076 |
| 6 | -0.056329 | -0.803942 | -0.139424 | -0.135061 | 0.084365 |
| 7 | -0.051078 | -0.057397 | 0.510363 | 0.920197 | 0.042649 |
# Attach cluster labels to the scaled features and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|---|
| recency | count | 5.305600e+04 | 1.050400e+04 | 2802.000000 | 1.450700e+04 | 448.000000 | 3045.000000 | 8.060000e+03 | 3.666000e+03 |
| mean | 2.556380e-02 | -4.096996e-02 | -0.046634 | -4.182130e-03 | -0.131500 | 0.007351 | -5.535829e-02 | -6.871471e-02 | |
| std | 1.011588e+00 | 9.563319e-01 | 0.974650 | 9.909511e-01 | 1.013895 | 1.012242 | 9.825865e-01 | 1.020562e+00 | |
| min | -2.970209e+00 | -3.159611e+00 | -2.967066 | -2.971724e+00 | -2.958085 | -2.964532 | -2.970562e+00 | -2.965700e+00 | |
| 25% | -7.216290e-01 | -5.542384e-01 | -0.709630 | -7.157046e-01 | -0.902031 | -0.730863 | -7.355041e-01 | -8.068929e-01 | |
| 50% | 1.388545e-01 | 1.324348e-01 | 0.091256 | 1.129127e-01 | -0.071028 | 0.131957 | 8.083789e-02 | 4.154101e-02 | |
| 75% | 8.597150e-01 | 5.948303e-01 | 0.690883 | 7.961975e-01 | 0.672470 | 0.842958 | 7.042944e-01 | 7.640316e-01 | |
| max | 1.872943e+00 | 1.878712e+00 | 1.773861 | 1.611766e+00 | 1.551060 | 1.609604 | 1.590980e+00 | 1.552794e+00 | |
| review_score | count | 5.305600e+04 | 1.050400e+04 | 2802.000000 | 1.450700e+04 | 448.000000 | 3045.000000 | 8.060000e+03 | 3.666000e+03 |
| mean | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | -0.976830 | -0.042835 | -8.039417e-01 | -5.739745e-02 | |
| std | 2.220467e-16 | 8.882207e-16 | 0.000000 | 2.081740e-17 | 0.412868 | 0.984972 | 1.110292e-16 | 2.081952e-17 | |
| min | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | -1.923758 | -2.297030 | -8.039417e-01 | -5.739745e-02 | |
| 25% | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | -1.550486 | -0.803942 | -8.039417e-01 | -5.739745e-02 | |
| 50% | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | -0.803942 | 0.689147 | -8.039417e-01 | -5.739745e-02 | |
| 75% | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | -0.803942 | 0.689147 | -8.039417e-01 | -5.739745e-02 | |
| max | 6.891468e-01 | -2.297030e+00 | -1.550486 | -5.739745e-02 | 0.440299 | 0.689147 | -8.039417e-01 | -5.739745e-02 | |
| order_payment_total | count | 5.305600e+04 | 1.050400e+04 | 2802.000000 | 1.450700e+04 | 448.000000 | 3045.000000 | 8.060000e+03 | 3.666000e+03 |
| mean | -1.401388e-01 | 1.161259e-04 | -0.110871 | -2.648915e-01 | -0.222958 | 3.475841 | -1.376791e-01 | 6.036739e-01 | |
| std | 4.588611e-01 | 6.340887e-01 | 0.460981 | 2.550509e-01 | 0.334240 | 3.326165 | 4.432580e-01 | 9.704552e-01 | |
| min | -6.828842e-01 | -7.260214e-01 | -0.661743 | -6.707843e-01 | -0.641276 | -0.637903 | -7.260214e-01 | -6.523419e-01 | |
| 25% | -4.561330e-01 | -4.171791e-01 | -0.441492 | -4.698074e-01 | -0.465961 | 2.119502 | -4.529843e-01 | -1.891234e-01 | |
| 50% | -2.725189e-01 | -1.995591e-01 | -0.246092 | -3.201992e-01 | -0.325395 | 2.981211 | -2.638825e-01 | 5.155552e-01 | |
| 75% | 1.873562e-02 | 1.642505e-01 | 0.067349 | -1.025792e-01 | -0.067156 | 4.779882 | 2.858655e-02 | 1.012184e+00 | |
| max | 2.111945e+00 | 3.204723e+00 | 1.974797 | 4.832585e-01 | 1.254015 | 60.736923 | 1.882585e+00 | 4.322916e+00 | |
| customer_zip_code_density_3digits | count | 5.305600e+04 | 1.050400e+04 | 2802.000000 | 1.450700e+04 | 448.000000 | 3045.000000 | 8.060000e+03 | 3.666000e+03 |
| mean | 6.556952e-03 | 1.190165e-02 | -0.150876 | -2.683659e-01 | 3.547106 | -0.060855 | -1.561375e-01 | 1.008649e+00 | |
| std | 9.871626e-01 | 1.035456e+00 | 0.715602 | 5.700549e-01 | 0.407352 | 0.958172 | 7.291576e-01 | 1.578943e+00 | |
| min | -1.087434e+00 | -1.080122e+00 | -1.080122 | -1.083778e+00 | 2.404031 | -1.087434 | -1.087434e+00 | -1.087434e+00 | |
| 25% | -6.889319e-01 | -6.998998e-01 | -0.714524 | -7.401156e-01 | 3.230284 | -0.740116 | -7.145238e-01 | -4.512929e-01 | |
| 50% | -2.282778e-01 | -2.465577e-01 | -0.279462 | -3.672052e-01 | 3.244908 | -0.330645 | -3.379573e-01 | 9.160454e-01 | |
| 75% | 3.274318e-01 | 2.945280e-01 | 0.236032 | 1.190407e-01 | 4.052880 | 0.283560 | 2.360322e-01 | 1.928753e+00 | |
| max | 4.052880e+00 | 4.052880e+00 | 2.404031 | 1.292612e+00 | 4.052880 | 4.052880 | 3.244908e+00 | 4.052880e+00 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
# Clip the axis: extreme payment outliers would otherwise flatten the boxes.
plt.ylim(-5, 15)
(-5.0, 15.0)
# Sweep BIRCH on feature set 2b, then refit the retained 6-cluster model.
model_name = "BIRCH_2b"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 6
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = bm.fit_predict(X_scaled)  # refit on the full data (the sweep above used the sample)
X_labels_reduced = bm.predict(X_reduced.astype(float))  # labels for the sampled points, for the 2D plots
projection_version = "Version2b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displays the palette so plot colours can be matched to groups
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Per-cluster summary statistics of the scaled features.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|---|
| recency | count | 6.000000 | 4147.000000 | 87150.000000 | 79.000000 | 737.000000 | 3969.000000 |
| mean | -0.306609 | 0.493320 | -0.025473 | -0.149108 | -0.156278 | 0.076338 | |
| std | 1.401359 | 0.808830 | 1.002122 | 1.220286 | 1.022860 | 0.985167 | |
| min | -2.110190 | -2.964532 | -3.159611 | -2.189398 | -2.959412 | -2.970615 | |
| 25% | -1.243976 | 0.084997 | -0.740058 | -1.384802 | -0.815307 | -0.553660 | |
| 50% | -0.436763 | 0.673367 | 0.092583 | 0.156829 | -0.177142 | 0.208544 | |
| 75% | 0.885181 | 1.120563 | 0.784736 | 0.935746 | 0.691960 | 0.867542 | |
| max | 1.331280 | 1.563815 | 1.878712 | 1.481748 | 1.609604 | 1.637597 | |
| review_score | count | 6.000000 | 4147.000000 | 87150.000000 | 79.000000 | 737.000000 | 3969.000000 |
| mean | -1.052790 | -0.074994 | 0.013652 | -0.246396 | -0.124252 | -0.191837 | |
| std | 1.467993 | 1.020812 | 0.989638 | 1.155728 | 1.176724 | 1.130678 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -2.297030 | -0.803942 | -0.057397 | -0.803942 | -0.803942 | -0.803942 | |
| 50% | -1.550486 | 0.689147 | 0.689147 | -0.057397 | 0.689147 | 0.689147 | |
| 75% | 0.315875 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| order_payment_total | count | 6.000000 | 4147.000000 | 87150.000000 | 79.000000 | 737.000000 | 3969.000000 |
| mean | 34.958233 | 2.080495 | -0.158802 | 13.578034 | 6.328200 | -0.185061 | |
| std | 12.752945 | 1.075134 | 0.420961 | 2.723291 | 1.677760 | 0.418130 | |
| min | 26.629597 | 0.043700 | -0.726021 | 9.452370 | 3.081744 | -0.664037 | |
| 25% | 29.751340 | 1.203096 | -0.452759 | 11.573625 | 5.050715 | -0.472056 | |
| 50% | 30.427016 | 1.967060 | -0.273171 | 12.876714 | 5.929022 | -0.301397 | |
| 75% | 31.608801 | 2.787138 | 0.008480 | 14.866623 | 7.510028 | -0.019319 | |
| max | 60.736923 | 6.643506 | 3.148047 | 20.907513 | 11.570229 | 2.772092 | |
| customer_zip_code_density_3digits | count | 6.000000 | 4147.000000 | 87150.000000 | 79.000000 | 737.000000 | 3969.000000 |
| mean | 0.323776 | -0.047921 | -0.149071 | -0.241282 | -0.216345 | 3.367810 | |
| std | 0.821697 | 1.070051 | 0.698093 | 0.686609 | 0.784541 | 0.588458 | |
| min | -0.235590 | -1.087434 | -1.087434 | -1.050874 | -1.087434 | 1.515627 | |
| 25% | -0.196288 | -0.791299 | -0.707212 | -0.718180 | -0.780331 | 3.230284 | |
| 50% | 0.091621 | -0.451293 | -0.279462 | -0.462261 | -0.392797 | 3.244908 | |
| 75% | 0.291786 | 0.298184 | 0.254312 | 0.109901 | 0.078825 | 4.052880 | |
| max | 1.928753 | 4.052880 | 3.244908 | 3.230284 | 4.052880 | 4.052880 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Feature set "3": add online time-of-day; payment total gets log1p scaling and
# zip-code density gets log scaling.
passthrough_features = []
standardscale_features = ['recency', 'review_score', 'online_timeofday']
log1pscale_features = ['order_payment_total']
logscale_features = ['customer_zip_code_density_3digits']

preprocessor_X = create_preprocessor_X(
    logscale_features=logscale_features,
    standardscale_features=standardscale_features,
    log1pscale_features=log1pscale_features,
)

# Column order mirrors the transformer order: log, log1p, standard, passthrough.
X = customers_df[
    logscale_features + log1pscale_features + standardscale_features + passthrough_features
].dropna().copy()
X_scaled = pd.DataFrame(
    preprocessor_X.fit_transform(X), index=X.index, columns=X.columns
).astype(float)
# Reproducible 5% sample to keep the slower model sweeps tractable.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# Build 2D projections of the "Version3" feature space for cluster visualisation.
projection_version = "Version3"
all_my_projections[projection_version] = {}
# PCA is cheap, so it is fitted on the full scaled dataset.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS is fitted on the 5% sample only (no transform for new points is used here).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap is fitted on the sample, then used to project the full dataset.
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset, PCA-initialised for more stable embeddings.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint the whole notebook session (dill is presumably imported elsewhere
# in the notebook — it is not in the visible import block; confirm).
dill.dump_session('notebook_env.db')
# NOTE(review): loading right after dumping looks redundant — presumably kept so
# this cell can be re-run alone to restore state; confirm before removing.
dill.load_session('notebook_env.db')
# Sweep KMeans on feature set 3, then refit the retained 6-cluster model.
model_name = "KMeans_3"
model_dict = test_model(X_reduced)  # NOTE(review): no model name passed — presumably defaults to KMeans; confirm test_model's signature
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 6
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
# projection_version is still "Version3" from the projection cell above.
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centres (scaled units) side by side with each cluster's share of customers.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | online_timeofday | fraction | |
|---|---|---|---|---|---|---|
| 0 | 0.409723 | -0.381903 | 0.425095 | 0.363058 | -1.059199 | 0.183509 |
| 1 | 0.154507 | -0.232914 | -1.384271 | 0.344648 | 0.072286 | 0.181833 |
| 2 | 0.017927 | 0.132243 | -0.034986 | -2.098999 | -0.110110 | 0.143192 |
| 3 | 0.151230 | 1.550934 | 0.219930 | 0.324657 | 0.035702 | 0.125083 |
| 4 | 0.491678 | -0.462321 | 0.586559 | 0.383642 | 0.845379 | 0.211098 |
| 5 | -1.471623 | -0.016153 | 0.175941 | 0.319156 | 0.089432 | 0.155285 |
# Attach cluster labels to the scaled features and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 17633.000000 | 17472.000000 | 13759.000000 | 12019.000000 | 20284.000000 | 14921.000000 |
| mean | 0.409645 | 0.154651 | 0.017961 | 0.151637 | 0.491502 | -1.472061 | |
| std | 0.710348 | 0.801164 | 0.954708 | 0.793369 | 0.659367 | 0.642711 | |
| min | -1.937791 | -2.841391 | -3.223741 | -3.015089 | -1.672762 | -5.508351 | |
| 25% | -0.097376 | -0.377629 | -0.611519 | -0.348130 | 0.031420 | -1.811604 | |
| 50% | 0.372585 | 0.185916 | 0.150517 | 0.203174 | 0.491003 | -1.360722 | |
| 75% | 0.855356 | 0.676243 | 0.657063 | 0.668066 | 0.923889 | -1.019153 | |
| max | 2.029172 | 2.029172 | 2.029172 | 2.029172 | 2.029172 | -0.193055 | |
| order_payment_total | count | 17633.000000 | 17472.000000 | 13759.000000 | 12019.000000 | 20284.000000 | 14921.000000 |
| mean | -0.382031 | -0.232179 | 0.132842 | 1.548641 | -0.463684 | -0.016255 | |
| std | 0.724850 | 0.790692 | 0.970205 | 0.761977 | 0.713381 | 0.763729 | |
| min | -5.884510 | -2.716282 | -5.884510 | 0.279098 | -2.936358 | -2.653588 | |
| 25% | -0.906314 | -0.800579 | -0.554554 | 0.960825 | -0.993928 | -0.574520 | |
| 50% | -0.345832 | -0.223170 | 0.101113 | 1.382934 | -0.437193 | 0.026102 | |
| 75% | 0.182121 | 0.338613 | 0.715672 | 1.990408 | 0.093415 | 0.519572 | |
| max | 1.770605 | 2.724203 | 3.702425 | 6.011733 | 1.433561 | 3.142781 | |
| recency | count | 17633.000000 | 17472.000000 | 13759.000000 | 12019.000000 | 20284.000000 | 14921.000000 |
| mean | 0.425694 | -1.384693 | -0.034721 | 0.221850 | 0.585837 | 0.175275 | |
| std | 0.693984 | 0.524445 | 0.927297 | 0.833597 | 0.621186 | 0.821097 | |
| min | -1.741352 | -3.158795 | -3.159611 | -2.943332 | -1.278862 | -2.966200 | |
| 25% | -0.130103 | -1.787682 | -0.547714 | -0.319849 | 0.091723 | -0.357935 | |
| 50% | 0.469442 | -1.380615 | 0.109440 | 0.293583 | 0.646931 | 0.208668 | |
| 75% | 0.996753 | -0.984637 | 0.587925 | 0.875100 | 1.122865 | 0.821522 | |
| max | 1.611766 | -0.155032 | 1.878712 | 1.609604 | 1.610699 | 1.872943 | |
| review_score | count | 17633.000000 | 17472.000000 | 13759.000000 | 12019.000000 | 20284.000000 | 14921.000000 |
| mean | 0.363180 | 0.344586 | -2.098894 | 0.325575 | 0.383518 | 0.319133 | |
| std | 0.507776 | 0.531401 | 0.389819 | 0.580796 | 0.498648 | 0.557031 | |
| min | -1.550486 | -2.297030 | -2.297030 | -2.297030 | -1.550486 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -2.297030 | -0.057397 | -0.057397 | -0.057397 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.803942 | 0.689147 | 0.689147 | 0.689147 | |
| online_timeofday | count | 17633.000000 | 17472.000000 | 13759.000000 | 12019.000000 | 20284.000000 | 14921.000000 |
| mean | -1.058423 | 0.070763 | -0.110434 | 0.036748 | 0.846347 | 0.089625 | |
| std | 0.642193 | 0.895248 | 0.961752 | 0.823534 | 0.635581 | 0.861350 | |
| min | -2.390579 | -2.390579 | -2.390579 | -2.390579 | -0.277462 | -2.390579 | |
| 25% | -1.464236 | -0.544486 | -0.762062 | -0.495037 | 0.342298 | -0.488444 | |
| 50% | -0.923595 | 0.055494 | -0.145598 | 0.025825 | 0.774152 | 0.078570 | |
| 75% | -0.544486 | 0.704923 | 0.540093 | 0.586246 | 1.232378 | 0.685144 | |
| max | -0.036811 | 2.353221 | 2.353221 | 2.353221 | 2.353221 | 2.353221 |
fig, ax = plt.subplots(figsize = (16,6))
# pandas.melt documents var_name as a scalar; the original passed a
# one-element list, which newer pandas rejects for single-level columns.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'), ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# Sweep Gaussian mixtures on feature set 3, then refit the 4-component model.
model_name = "GaussianMixture_3"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 4
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
gm.set_params(n_init=10)  # more EM restarts for the final fit than during the sweep
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
# projection_version is still "Version3" from the projection cell above.
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displays the palette so plot colours can be matched to groups
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means (scaled units) side by side with each component's mixture weight.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| customer_zip_code_density_3digits | order_payment_total | recency | review_score | online_timeofday | weights | |
|---|---|---|---|---|---|---|
| 0 | -0.052914 | -0.064824 | -1.307144 | -0.057397 | -0.018768 | 0.049152 |
| 1 | 0.012722 | -0.027255 | 0.023422 | 0.689147 | 0.023101 | 0.568008 |
| 2 | -0.012129 | 0.088306 | -0.041794 | -1.577587 | -0.057044 | 0.241184 |
| 3 | -0.012001 | -0.018574 | 0.430793 | -0.057397 | 0.011008 | 0.141656 |
# Attach cluster labels to the scaled features and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 4.678000e+03 | 5.457900e+04 | 23171.000000 | 1.366000e+04 |
| mean | -5.746427e-02 | 1.272177e-02 | -0.012128 | -1.057897e-02 | |
| std | 1.036019e+00 | 9.940009e-01 | 1.005499 | 1.001066e+00 | |
| min | -4.787637e+00 | -5.508351e+00 | -5.508351 | -5.508351e+00 | |
| 25% | -7.410353e-01 | -6.304240e-01 | -0.659443 | -6.692985e-01 | |
| 50% | 3.645482e-02 | 1.727823e-01 | 0.146006 | 1.505166e-01 | |
| 75% | 6.653264e-01 | 6.897289e-01 | 0.665326 | 6.653264e-01 | |
| max | 2.029172e+00 | 2.029172e+00 | 2.029172 | 2.029172e+00 | |
| order_payment_total | count | 4.678000e+03 | 5.457900e+04 | 23171.000000 | 1.366000e+04 |
| mean | -6.144748e-02 | -2.725516e-02 | 0.088333 | -1.989455e-02 | |
| std | 9.684988e-01 | 9.928136e-01 | 1.026573 | 9.846356e-01 | |
| min | -2.653588e+00 | -2.936358e+00 | -5.884510 | -2.597484e+00 | |
| 25% | -7.415908e-01 | -7.264135e-01 | -0.637156 | -6.969943e-01 | |
| 50% | -1.049827e-01 | -7.477367e-02 | 0.033021 | -6.798709e-02 | |
| 75% | 5.042703e-01 | 5.637407e-01 | 0.693278 | 5.657620e-01 | |
| max | 4.695665e+00 | 5.163559e+00 | 6.011733 | 4.527301e+00 | |
| recency | count | 4.678000e+03 | 5.457900e+04 | 23171.000000 | 1.366000e+04 |
| mean | -1.391657e+00 | 2.342180e-02 | -0.041790 | 4.538917e-01 | |
| std | 4.583549e-01 | 1.013262e+00 | 0.968321 | 6.287828e-01 | |
| min | -2.971724e+00 | -2.970209e+00 | -3.159611 | -8.237196e-01 | |
| 25% | -1.725302e+00 | -7.241899e-01 | -0.650200 | -1.107284e-01 | |
| 50% | -1.353542e+00 | 1.378200e-01 | 0.110267 | 4.620710e-01 | |
| 75% | -1.003735e+00 | 8.599007e-01 | 0.664394 | 9.780761e-01 | |
| max | -6.364990e-01 | 1.872943e+00 | 1.878712 | 1.611766e+00 | |
| review_score | count | 4.678000e+03 | 5.457900e+04 | 23171.000000 | 1.366000e+04 |
| mean | -5.739745e-02 | 6.891468e-01 | -1.577851 | -5.739745e-02 | |
| std | 6.939636e-18 | 1.110233e-16 | 0.732143 | 1.387830e-17 | |
| min | -5.739745e-02 | 6.891468e-01 | -2.297030 | -5.739745e-02 | |
| 25% | -5.739745e-02 | 6.891468e-01 | -2.297030 | -5.739745e-02 | |
| 50% | -5.739745e-02 | 6.891468e-01 | -1.550486 | -5.739745e-02 | |
| 75% | -5.739745e-02 | 6.891468e-01 | -0.803942 | -5.739745e-02 | |
| max | -5.739745e-02 | 6.891468e-01 | 0.564723 | -5.739745e-02 | |
| online_timeofday | count | 4.678000e+03 | 5.457900e+04 | 23171.000000 | 1.366000e+04 |
| mean | -1.756736e-02 | 2.310096e-02 | -0.057063 | 1.051021e-02 | |
| std | 1.024077e+00 | 1.004687e+00 | 0.993219 | 9.803654e-01 | |
| min | -2.390579e+00 | -2.390579e+00 | -2.390579 | -2.390579e+00 | |
| 25% | -7.150851e-01 | -6.433841e-01 | -0.742282 | -6.343185e-01 | |
| 50% | -2.362427e-02 | 4.890082e-02 | -0.079666 | 1.263827e-02 | |
| 75% | 7.469548e-01 | 7.411858e-01 | 0.645585 | 6.950334e-01 | |
| max | 2.353221e+00 | 2.353221e+00 | 2.353221 | 2.353221e+00 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Birch clustering on the current feature set -------------------------
model_name = "BIRCH_3"
# test_model is a notebook helper defined earlier; judging by the keys used
# below it presumably fits one model per candidate cluster count and records
# metrics -- confirm against its definition.
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Cluster count retained after inspecting the evaluation plots above.
n_clusters = 6
# Look up the stored model trained with exactly n_clusters clusters.
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Re-fit on the full scaled data; the 5% sample gets its own labels for the
# 2D projection plots.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displayed for reference only; nothing below uses it
# One DataFrame of scaled features per cluster label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Per-cluster descriptive statistics of every feature.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 34403.000000 | 26654.000000 | 12744.000000 | 4695.000000 | 10302.000000 | 7290.000000 |
| mean | 0.531041 | 0.052024 | -1.072606 | -0.366349 | -0.094705 | -0.451450 | |
| std | 0.691372 | 0.917891 | 0.875511 | 0.902191 | 1.005403 | 0.947460 | |
| min | -1.971885 | -4.366046 | -5.508351 | -3.346208 | -4.066923 | -4.066923 | |
| 25% | 0.146006 | -0.513270 | -1.622031 | -1.019153 | -0.773362 | -1.061598 | |
| 50% | 0.537013 | 0.163933 | -1.061598 | -0.291529 | -0.009765 | -0.370175 | |
| 75% | 1.008102 | 0.676243 | -0.407989 | 0.277475 | 0.634702 | 0.220151 | |
| max | 2.029172 | 2.029172 | 1.089657 | 2.029172 | 2.029172 | 2.029172 | |
| order_payment_total | count | 34403.000000 | 26654.000000 | 12744.000000 | 4695.000000 | 10302.000000 | 7290.000000 |
| mean | -0.037525 | -0.376603 | -0.151227 | 1.050997 | -0.319907 | 1.593612 | |
| std | 0.928228 | 0.730495 | 0.716792 | 0.947871 | 0.819435 | 0.881867 | |
| min | -5.884510 | -2.643283 | -2.363585 | -1.256602 | -2.653588 | -0.258071 | |
| 25% | -0.718494 | -0.898903 | -0.650524 | 0.381619 | -0.913361 | 0.883282 | |
| 50% | 0.024559 | -0.369791 | -0.122659 | 0.937125 | -0.366017 | 1.482165 | |
| 75% | 0.603144 | 0.117059 | 0.342572 | 1.599523 | 0.238747 | 2.187870 | |
| max | 3.367345 | 2.652130 | 3.142781 | 6.011733 | 3.001522 | 5.163559 | |
| recency | count | 34403.000000 | 26654.000000 | 12744.000000 | 4695.000000 | 10302.000000 | 7290.000000 |
| mean | -0.094117 | 0.459858 | 0.258481 | -0.224286 | -1.377175 | 0.401572 | |
| std | 0.962342 | 0.672621 | 0.845972 | 1.085216 | 0.543365 | 0.961413 | |
| min | -3.102527 | -1.949781 | -3.159611 | -2.976835 | -2.971724 | -2.967019 | |
| 25% | -0.773393 | -0.051745 | -0.251875 | -1.096425 | -1.778749 | -0.148526 | |
| 50% | -0.038059 | 0.488379 | 0.365701 | -0.136641 | -1.406583 | 0.692468 | |
| 75% | 0.656541 | 1.023223 | 0.895766 | 0.655075 | -1.003854 | 1.146727 | |
| max | 1.878712 | 1.759115 | 1.872943 | 1.787829 | 1.003771 | 1.609604 | |
| review_score | count | 34403.000000 | 26654.000000 | 12744.000000 | 4695.000000 | 10302.000000 | 7290.000000 |
| mean | 0.189499 | -0.017788 | -0.162981 | -1.944032 | 0.200503 | 0.424344 | |
| std | 0.798029 | 1.053957 | 1.064926 | 0.567059 | 0.761841 | 0.465560 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -1.550486 | |
| 25% | -0.057397 | -0.057397 | -0.803942 | -2.297030 | -0.057397 | -0.057397 | |
| 50% | 0.689147 | 0.689147 | -0.057397 | -2.297030 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | -1.550486 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | -0.430670 | 0.689147 | 0.689147 | |
| online_timeofday | count | 34403.000000 | 26654.000000 | 12744.000000 | 4695.000000 | 10302.000000 | 7290.000000 |
| mean | 0.573942 | -0.659052 | 0.660587 | -0.110847 | -0.758369 | -0.310609 | |
| std | 0.811196 | 0.766671 | 0.716767 | 0.958747 | 0.808033 | 0.831521 | |
| min | -2.387283 | -2.390579 | -2.390579 | -2.387283 | -2.390579 | -2.390579 | |
| 25% | 0.012638 | -1.124687 | 0.252466 | -0.745579 | -1.233475 | -0.811510 | |
| 50% | 0.569763 | -0.600528 | 0.724703 | -0.129115 | -0.729096 | -0.297242 | |
| 75% | 1.110404 | -0.099446 | 1.110404 | 0.530204 | -0.258507 | 0.207137 | |
| max | 2.353221 | 1.641156 | 2.353221 | 2.353221 | 2.340034 | 2.336738 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Feature set for the next round of models ----------------------------
# Columns are grouped by the scaling each one receives in the preprocessor.
passthrough_features = []
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total', 'order_avg_shipping_distance']
logscale_features = ['customer_zip_code_density_3digits']
# create_preprocessor_X is a notebook helper defined earlier -- presumably a
# ColumnTransformer-style pipeline over these three scaling groups; confirm
# against its definition.
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Keep only the selected columns and drop rows with missing values.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
# Restore DataFrame form (fit_transform returns a bare array).
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% reproducible sample (seed set earlier in the notebook) used for the
# expensive manifold projections and model sweeps.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# --- 2D projections of this feature set, stored under "Version4" ---------
projection_version = "Version4"
all_my_projections[projection_version] = {}
# PCA is cheap: fit on the full scaled dataset.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS is fitted on the 5% sample only (no out-of-sample transform is done).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap: fit on the sample, then project the full dataset with transform().
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset, initialised from PCA for stability.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint: persist the whole notebook session to disk, then reload it
# (load is a no-op right after dump; it lets the notebook resume from here).
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# --- KMeans on this feature set ------------------------------------------
model_name = "KMeans_4"
# test_model with no model-type argument -- presumably defaults to KMeans;
# confirm against its definition earlier in the notebook.
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Cluster count retained after inspecting the evaluation plots above.
n_clusters = 6
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Re-fit on the full scaled data; label the 5% sample for projection plots.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Table of cluster centers (one row per cluster, in scaled-feature space)
# with the fraction of customers assigned to each cluster.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| customer_zip_code_density_3digits | order_payment_total | order_avg_shipping_distance | recency | review_score | fraction | |
|---|---|---|---|---|---|---|
| 0 | -1.403372 | -0.045490 | 0.701381 | 0.188683 | 0.304423 | 0.172488 |
| 1 | 0.141693 | 1.506091 | 0.258837 | 0.221355 | 0.335285 | 0.127394 |
| 2 | 0.465876 | -0.388375 | 0.173353 | 0.678870 | 0.390131 | 0.232974 |
| 3 | 0.125014 | -0.204608 | 0.231709 | -1.313950 | 0.352284 | 0.183769 |
| 4 | 0.638096 | -0.555701 | -1.854060 | 0.207098 | 0.288892 | 0.143348 |
| 5 | 0.006556 | 0.167780 | 0.207116 | -0.047609 | -2.086384 | 0.140028 |
# Per-cluster profile: attach the cluster labels to the scaled features
# (assign returns a copy, leaving X_scaled untouched), then show descriptive
# statistics for every feature within each label group.
data = X_scaled.assign(labels=X_labels)
cluster_stats = data.groupby('labels').describe()
cluster_stats.T
| labels | 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 16574.000000 | 12241.000000 | 22386.000000 | 17658.000000 | 13774.000000 | 13455.000000 |
| mean | -1.403588 | 0.138510 | 0.465471 | 0.128729 | 0.638008 | 0.006430 | |
| std | 0.654013 | 0.780657 | 0.634902 | 0.788786 | 0.651813 | 0.946076 | |
| min | -5.508351 | -3.015089 | -1.121030 | -2.841391 | -2.841391 | -3.346208 | |
| 25% | -1.726095 | -0.348130 | 0.000685 | -0.385136 | 0.261417 | -0.611519 | |
| 50% | -1.322907 | 0.172782 | 0.387026 | 0.155008 | 0.634702 | 0.123153 | |
| 75% | -0.939131 | 0.659825 | 0.855356 | 0.657063 | 1.111320 | 0.651518 | |
| max | -0.080377 | 2.029172 | 2.029172 | 2.029172 | 2.029172 | 2.029172 | |
| order_payment_total | count | 16574.000000 | 12241.000000 | 22386.000000 | 17658.000000 | 13774.000000 | 13455.000000 |
| mean | -0.046483 | 1.507636 | -0.387193 | -0.204890 | -0.555395 | 0.167305 | |
| std | 0.764676 | 0.765778 | 0.639581 | 0.760486 | 0.886148 | 0.962105 | |
| min | -2.363585 | 0.346115 | -5.884510 | -2.653588 | -2.880980 | -2.519897 | |
| 25% | -0.603530 | 0.908437 | -0.855781 | -0.739556 | -1.262921 | -0.511822 | |
| 50% | -0.023776 | 1.327801 | -0.339619 | -0.194068 | -0.602619 | 0.136490 | |
| 75% | 0.488474 | 1.946468 | 0.132223 | 0.339448 | 0.074357 | 0.729043 | |
| max | 3.142781 | 6.011733 | 1.069438 | 2.724203 | 2.799278 | 3.702425 | |
| order_avg_shipping_distance | count | 16574.000000 | 12241.000000 | 22386.000000 | 17658.000000 | 13774.000000 | 13455.000000 |
| mean | 0.701171 | 0.260547 | 0.173551 | 0.229802 | -1.854251 | 0.207130 | |
| std | 0.515794 | 0.698466 | 0.494547 | 0.593050 | 0.731379 | 0.777514 | |
| min | -2.455529 | -3.402135 | -1.194086 | -2.896332 | -9.099968 | -3.268441 | |
| 25% | 0.351500 | -0.033263 | -0.068153 | -0.033366 | -2.257484 | -0.045553 | |
| 50% | 0.704615 | 0.293783 | 0.180157 | 0.268235 | -1.830146 | 0.290986 | |
| 75% | 1.110950 | 0.696150 | 0.466646 | 0.611189 | -1.303117 | 0.688105 | |
| max | 2.363217 | 1.682305 | 1.644676 | 1.672831 | -0.417900 | 1.937106 | |
| recency | count | 16574.000000 | 12241.000000 | 22386.000000 | 17658.000000 | 13774.000000 | 13455.000000 |
| mean | 0.185172 | 0.221262 | 0.679074 | -1.314095 | 0.208266 | -0.047836 | |
| std | 0.809284 | 0.811400 | 0.551126 | 0.555647 | 0.889173 | 0.921458 | |
| min | -2.966200 | -2.943332 | -0.692917 | -3.102527 | -2.966307 | -3.159611 | |
| 25% | -0.313811 | -0.297297 | 0.228002 | -1.746987 | -0.341959 | -0.544626 | |
| 50% | 0.216229 | 0.280888 | 0.730799 | -1.308218 | 0.307073 | 0.092331 | |
| 75% | 0.816312 | 0.855344 | 1.155937 | -0.876884 | 0.913221 | 0.561947 | |
| max | 1.872943 | 1.609604 | 1.611766 | -0.179868 | 1.558554 | 1.878712 | |
| review_score | count | 16574.000000 | 12241.000000 | 22386.000000 | 17658.000000 | 13774.000000 | 13455.000000 |
| mean | 0.304431 | 0.334723 | 0.389805 | 0.352706 | 0.288855 | -2.086654 | |
| std | 0.570187 | 0.566681 | 0.482900 | 0.521323 | 0.672779 | 0.411360 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -0.057397 | -0.057397 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.803942 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Gaussian mixture model on this feature set --------------------------
model_name = "GaussianMixture_4"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Component count retained after inspecting the evaluation plots above.
n_clusters = 4
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# More EM restarts than the sweep used, to stabilise the final fit.
gm.set_params(n_init=10)
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displayed for reference only; nothing below uses it
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Table of component means (scaled-feature space) with mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| customer_zip_code_density_3digits | order_payment_total | order_avg_shipping_distance | recency | review_score | weights | |
|---|---|---|---|---|---|---|
| 0 | -0.220342 | 0.037141 | 0.447871 | -0.935634 | 0.539340 | 0.239195 |
| 1 | 0.602340 | -0.246930 | -1.378911 | 0.107780 | 0.530361 | 0.181423 |
| 2 | -0.164325 | 0.052196 | 0.387504 | 0.707930 | 0.498318 | 0.311506 |
| 3 | -0.020106 | 0.073376 | 0.083354 | -0.060772 | -1.420269 | 0.267876 |
# Per-cluster profile: attach the cluster labels to the scaled features
# (assign returns a copy, leaving X_scaled untouched), then show descriptive
# statistics for every feature within each label group.
data = X_scaled.assign(labels=X_labels)
cluster_stats = data.groupby('labels').describe()
cluster_stats.T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 24074.000000 | 16112.000000 | 32644.000000 | 23258.000000 |
| mean | -0.189880 | 0.609766 | -0.141641 | -0.027073 | |
| std | 1.021788 | 0.646345 | 1.005521 | 1.009244 | |
| min | -5.508351 | -2.924617 | -5.508351 | -5.508351 | |
| 25% | -0.889012 | 0.253294 | -0.818091 | -0.669298 | |
| 50% | -0.138157 | 0.620478 | -0.074772 | 0.123153 | |
| 75% | 0.537013 | 1.111320 | 0.558515 | 0.657063 | |
| max | 2.029172 | 2.029172 | 2.029172 | 2.029172 | |
| order_payment_total | count | 24074.000000 | 16112.000000 | 32644.000000 | 23258.000000 |
| mean | 0.044854 | -0.300816 | 0.052961 | 0.087629 | |
| std | 0.976213 | 1.065009 | 0.932405 | 1.030913 | |
| min | -2.438110 | -2.936358 | -2.433369 | -5.884510 | |
| 25% | -0.648821 | -1.091921 | -0.611027 | -0.640532 | |
| 50% | -0.026300 | -0.368431 | 0.018480 | 0.030883 | |
| 75% | 0.602866 | 0.337047 | 0.598416 | 0.694020 | |
| max | 5.163559 | 4.695665 | 4.673832 | 6.011733 | |
| order_avg_shipping_distance | count | 24074.000000 | 16112.000000 | 32644.000000 | 23258.000000 |
| mean | 0.453845 | -1.603194 | 0.397929 | 0.082329 | |
| std | 0.485558 | 0.773416 | 0.502062 | 0.946841 | |
| min | -1.005127 | -9.099968 | -1.176265 | -9.099968 | |
| 25% | 0.089959 | -2.101229 | 0.052576 | -0.196106 | |
| 50% | 0.396200 | -1.506301 | 0.338010 | 0.258989 | |
| 75% | 0.737213 | -0.982451 | 0.707413 | 0.679364 | |
| max | 2.014664 | 1.342269 | 2.363217 | 2.014664 | |
| recency | count | 24074.000000 | 16112.000000 | 32644.000000 | 23258.000000 |
| mean | -1.017375 | 0.124811 | 0.726119 | -0.052546 | |
| std | 0.620310 | 0.961120 | 0.487279 | 0.966996 | |
| min | -2.971724 | -2.966307 | -0.363118 | -3.159611 | |
| 25% | -1.490896 | -0.503110 | 0.316364 | -0.665508 | |
| 50% | -0.949864 | 0.216661 | 0.739625 | 0.092498 | |
| 75% | -0.492085 | 0.901114 | 1.146569 | 0.647100 | |
| max | 0.099004 | 1.558402 | 1.872943 | 1.878712 | |
| review_score | count | 24074.000000 | 16112.000000 | 32644.000000 | 23258.000000 |
| mean | 0.528513 | 0.521642 | 0.476780 | -1.577614 | |
| std | 0.305889 | 0.310923 | 0.335655 | 0.723828 | |
| min | -0.181821 | -0.803942 | -0.306246 | -2.297030 | |
| 25% | 0.689147 | 0.689147 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | -1.550486 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | -0.803942 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Birch clustering on this feature set --------------------------------
model_name = "BIRCH_4"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Cluster count retained after inspecting the evaluation plots above.
n_clusters = 5
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Re-fit on the full scaled data; label the 5% sample for projection plots.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displayed for reference only; nothing below uses it
# One DataFrame of scaled features per cluster label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Per-cluster descriptive statistics of every feature.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| customer_zip_code_density_3digits | count | 35651.000000 | 12928.000000 | 6969.000000 | 34632.000000 | 5908.000000 |
| mean | -0.729513 | 0.422666 | -0.029539 | 0.471002 | 0.751143 | |
| std | 0.885914 | 0.753276 | 0.927602 | 0.755632 | 0.675964 | |
| min | -5.508351 | -5.508351 | -3.223741 | -2.081436 | -2.924617 | |
| 25% | -1.341643 | 0.031420 | -0.659443 | 0.026360 | 0.308867 | |
| 50% | -0.659443 | 0.518221 | 0.085510 | 0.518221 | 0.689729 | |
| 75% | -0.080377 | 0.882423 | 0.657063 | 0.974008 | 1.203577 | |
| max | 2.029172 | 2.029172 | 2.029172 | 2.029172 | 2.029172 | |
| order_payment_total | count | 35651.000000 | 12928.000000 | 6969.000000 | 34632.000000 | 5908.000000 |
| mean | -0.223541 | -0.482778 | 2.001167 | 0.064790 | -0.334994 | |
| std | 0.812401 | 0.831492 | 0.750892 | 0.794199 | 0.856930 | |
| min | -5.884510 | -2.880980 | 0.071625 | -2.936358 | -2.519897 | |
| 25% | -0.840483 | -1.104311 | 1.456042 | -0.440388 | -0.939266 | |
| 50% | -0.280953 | -0.521430 | 1.934311 | 0.118773 | -0.408640 | |
| 75% | 0.367999 | 0.092109 | 2.459251 | 0.592727 | 0.269664 | |
| max | 3.365019 | 3.868849 | 6.011733 | 2.913407 | 2.703801 | |
| order_avg_shipping_distance | count | 35651.000000 | 12928.000000 | 6969.000000 | 34632.000000 | 5908.000000 |
| mean | 0.584222 | -1.600997 | 0.182317 | 0.057263 | -0.572801 | |
| std | 0.483007 | 0.980815 | 0.896408 | 0.686364 | 0.993702 | |
| min | -1.495959 | -9.099968 | -4.551001 | -2.947535 | -4.870958 | |
| 25% | 0.230776 | -2.224723 | -0.078463 | -0.246177 | -1.300883 | |
| 50% | 0.570862 | -1.735495 | 0.298793 | 0.133493 | -0.306094 | |
| 75% | 0.916323 | -0.909166 | 0.736763 | 0.481690 | 0.212208 | |
| max | 2.014664 | 1.129546 | 1.727942 | 2.363217 | 1.536029 | |
| recency | count | 35651.000000 | 12928.000000 | 6969.000000 | 34632.000000 | 5908.000000 |
| mean | -0.216985 | -0.249411 | -0.089871 | 0.337589 | -0.017762 | |
| std | 1.000006 | 1.030787 | 1.010333 | 0.890754 | 0.975765 | |
| min | -3.159611 | -2.970209 | -2.966073 | -2.966205 | -3.102527 | |
| 25% | -0.976851 | -1.010233 | -0.793205 | -0.233847 | -0.638734 | |
| 50% | -0.174464 | -0.212812 | 0.012648 | 0.542194 | 0.171288 | |
| 75% | 0.527115 | 0.544771 | 0.723374 | 1.051380 | 0.691362 | |
| max | 1.872943 | 1.558554 | 1.609604 | 1.611766 | 1.878712 | |
| review_score | count | 35651.000000 | 12928.000000 | 6969.000000 | 34632.000000 | 5908.000000 |
| mean | -0.186849 | 0.400356 | -0.303362 | 0.415872 | -1.828502 | |
| std | 1.107788 | 0.482149 | 1.208009 | 0.454249 | 0.626515 | |
| min | -2.297030 | -2.297030 | -2.297030 | -1.550486 | -2.297030 | |
| 25% | -0.803942 | -0.057397 | -1.550486 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | -0.057397 | 0.689147 | -2.297030 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -1.550486 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.057397 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- New feature set: swap zip-code density for the delivery flag --------
# Columns are grouped by the scaling each one receives in the preprocessor;
# passthrough columns are kept unscaled.
passthrough_features = ['order_is_delivered']
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total', 'order_avg_shipping_distance']
logscale_features = []
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Keep only the selected columns and drop rows with missing values.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
# Restore DataFrame form (fit_transform returns a bare array).
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% reproducible sample (seed set earlier) for the expensive steps below.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# --- 2D projections of this feature set, stored under "Version5" ---------
projection_version = "Version5"
all_my_projections[projection_version] = {}
# PCA is cheap: fit on the full scaled dataset.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS is fitted on the 5% sample only (no out-of-sample transform is done).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap: fit on the sample, then project the full dataset with transform().
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset, initialised from PCA for stability.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint: persist the whole notebook session to disk, then reload it
# (load is a no-op right after dump; it lets the notebook resume from here).
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# --- KMeans on this feature set ------------------------------------------
model_name = "KMeans_5"
# test_model with no model-type argument -- presumably defaults to KMeans;
# confirm against its definition earlier in the notebook.
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Cluster count retained after inspecting the evaluation plots above.
n_clusters = 5
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Re-fit on the full scaled data; label the 5% sample for projection plots.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Table of cluster centers (one row per cluster, in scaled-feature space)
# with the fraction of customers assigned to each cluster.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| order_payment_total | order_avg_shipping_distance | recency | review_score | order_is_delivered | fraction | |
|---|---|---|---|---|---|---|
| 0 | 1.222386 | 0.373326 | 0.348640 | 0.362698 | 0.990288 | 0.191949 |
| 1 | -0.501220 | 0.336606 | 0.649782 | 0.378464 | 0.992324 | 0.285676 |
| 2 | -0.542056 | -1.827795 | 0.234394 | 0.309921 | 0.985954 | 0.147698 |
| 3 | -0.160672 | 0.315015 | -1.242445 | 0.358454 | 0.983768 | 0.225595 |
| 4 | 0.176591 | 0.210336 | -0.047706 | -2.041381 | 0.787325 | 0.149082 |
# Per-cluster profile: attach the cluster labels to the scaled features
# (assign returns a copy, leaving X_scaled untouched), then show descriptive
# statistics for every feature within each label group.
data = X_scaled.assign(labels=X_labels)
cluster_stats = data.groupby('labels').describe()
cluster_stats.T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| order_payment_total | count | 18444.000000 | 27450.000000 | 14192.000000 | 21677.000000 | 14325.000000 |
| mean | 1.218019 | -0.504905 | -0.541787 | -0.158958 | 0.176564 | |
| std | 0.743828 | 0.573392 | 0.883788 | 0.761984 | 0.976849 | |
| min | 0.213980 | -5.884510 | -2.880980 | -2.438110 | -2.519897 | |
| 25% | 0.658502 | -0.922446 | -1.232545 | -0.686820 | -0.497339 | |
| 50% | 0.991529 | -0.470996 | -0.583816 | -0.145506 | 0.136994 | |
| 75% | 1.565738 | -0.034134 | 0.085039 | 0.355625 | 0.719913 | |
| max | 6.011733 | 0.550349 | 3.244561 | 3.187162 | 4.407333 | |
| order_avg_shipping_distance | count | 18444.000000 | 27450.000000 | 14192.000000 | 21677.000000 | 14325.000000 |
| mean | 0.372967 | 0.336257 | -1.828738 | 0.315140 | 0.210326 | |
| std | 0.666342 | 0.514318 | 0.736200 | 0.609461 | 0.791673 | |
| min | -2.925142 | -0.969498 | -9.099968 | -2.278341 | -3.268441 | |
| 25% | 0.027909 | 0.028795 | -2.242581 | 0.009227 | -0.046272 | |
| 50% | 0.377506 | 0.309604 | -1.803011 | 0.318268 | 0.293766 | |
| 75% | 0.796115 | 0.674459 | -1.264441 | 0.696745 | 0.697005 | |
| max | 2.014664 | 2.363217 | -0.530850 | 1.678877 | 2.014664 | |
| recency | count | 18444.000000 | 27450.000000 | 14192.000000 | 21677.000000 | 14325.000000 |
| mean | 0.352481 | 0.648422 | 0.234312 | -1.242865 | -0.047757 | |
| std | 0.731725 | 0.549582 | 0.884988 | 0.557835 | 0.912747 | |
| min | -2.241105 | -0.619062 | -2.966307 | -2.971724 | -3.159611 | |
| 25% | -0.182593 | 0.189264 | -0.299924 | -1.648384 | -0.521958 | |
| 50% | 0.405028 | 0.680230 | 0.339376 | -1.192226 | 0.084021 | |
| 75% | 0.919643 | 1.130180 | 0.938786 | -0.793511 | 0.549255 | |
| max | 1.872943 | 1.611766 | 1.558554 | -0.188007 | 1.878712 | |
| review_score | count | 18444.000000 | 27450.000000 | 14192.000000 | 21677.000000 | 14325.000000 |
| mean | 0.362834 | 0.378109 | 0.309942 | 0.358692 | -2.041553 | |
| std | 0.511773 | 0.488120 | 0.641450 | 0.510102 | 0.459758 | |
| min | -2.297030 | -2.297030 | -2.297030 | -1.550486 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -0.057397 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -2.297030 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -1.550486 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | -0.803942 | |
| order_is_delivered | count | 18444.000000 | 27450.000000 | 14192.000000 | 21677.000000 | 14325.000000 |
| mean | 0.990385 | 0.992231 | 0.985943 | 0.983815 | 0.787295 | |
| std | 0.117564 | 0.103085 | 0.142157 | 0.152344 | 0.540467 | |
| min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | |
| 25% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Gaussian mixture model on this feature set --------------------------
model_name = "GaussianMixture_5"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Component count retained after inspecting the evaluation plots above.
n_clusters = 4
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# More EM restarts than the sweep used, to stabilise the final fit.
gm.set_params(n_init=10)
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displayed for reference only; nothing below uses it
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Table of component means (scaled-feature space) with mixture weights.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| order_payment_total | order_avg_shipping_distance | recency | review_score | order_is_delivered | weights | |
|---|---|---|---|---|---|---|
| 0 | -0.027326 | -0.052505 | 0.025305 | 0.689147 | 1.000000 | 0.565499 |
| 1 | -0.031403 | 0.036229 | -0.013010 | -0.057397 | 1.000000 | 0.189204 |
| 2 | 0.081195 | 0.098462 | -0.007002 | -1.526364 | 1.000000 | 0.214813 |
| 3 | 0.129668 | 0.055309 | -0.339342 | -1.672019 | -0.361462 | 0.030484 |
# Per-cluster profile: attach the cluster labels to the scaled features
# (assign returns a copy, leaving X_scaled untouched), then show descriptive
# statistics for every feature within each label group.
data = X_scaled.assign(labels=X_labels)
cluster_stats = data.groupby('labels').describe()
cluster_stats.T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| order_payment_total | count | 5.433800e+04 | 1.818400e+04 | 20637.000000 | 2929.000000 |
| mean | -2.732709e-02 | -3.141169e-02 | 0.081228 | 0.129664 | |
| std | 9.925586e-01 | 9.804662e-01 | 1.012875 | 1.118398 | |
| min | -2.936358e+00 | -2.653588e+00 | -2.791709 | -5.884510 | |
| 25% | -7.264135e-01 | -7.104253e-01 | -0.637531 | -0.631551 | |
| 50% | -7.429621e-02 | -8.075732e-02 | 0.033021 | 0.019422 | |
| 75% | 5.639197e-01 | 5.534941e-01 | 0.684597 | 0.737733 | |
| max | 5.163559e+00 | 4.695665e+00 | 6.011733 | 4.707433 | |
| order_avg_shipping_distance | count | 5.433800e+04 | 1.818400e+04 | 20637.000000 | 2929.000000 |
| mean | -5.250456e-02 | 3.623316e-02 | 0.098472 | 0.055299 | |
| std | 1.023410e+00 | 9.811898e-01 | 0.952456 | 0.936631 | |
| min | -9.099968e+00 | -4.328646e+00 | -9.099968 | -3.796466 | |
| 25% | -5.011245e-01 | -3.104654e-01 | -0.134035 | -0.167079 | |
| 50% | 1.802282e-01 | 2.558893e-01 | 0.277435 | 0.179656 | |
| 75% | 6.254437e-01 | 6.753223e-01 | 0.686727 | 0.624863 | |
| max | 2.363217e+00 | 1.687008e+00 | 2.014664 | 1.727942 | |
| recency | count | 5.433800e+04 | 1.818400e+04 | 20637.000000 | 2929.000000 |
| mean | 2.530500e-02 | -1.301417e-02 | -0.006998 | -0.339351 | |
| std | 1.012160e+00 | 9.959295e-01 | 0.944550 | 1.108695 | |
| min | -2.970209e+00 | -2.971724e+00 | -2.970562 | -3.159611 | |
| 25% | -7.224714e-01 | -7.248030e-01 | -0.577506 | -1.184945 | |
| 50% | 1.386439e-01 | 1.032629e-01 | 0.143170 | -0.255270 | |
| 75% | 8.603117e-01 | 7.939260e-01 | 0.680678 | 0.497361 | |
| max | 1.558593e+00 | 1.553150e+00 | 1.558554 | 1.878712 | |
| review_score | count | 5.433800e+04 | 1.818400e+04 | 20637.000000 | 2929.000000 |
| mean | 6.891468e-01 | -5.739745e-02 | -1.526663 | -1.672039 | |
| std | 2.220466e-16 | 2.081725e-17 | 0.733201 | 0.985003 | |
| min | 6.891468e-01 | -5.739745e-02 | -2.297030 | -2.297030 | |
| 25% | 6.891468e-01 | -5.739745e-02 | -2.297030 | -2.297030 | |
| 50% | 6.891468e-01 | -5.739745e-02 | -1.550486 | -2.297030 | |
| 75% | 6.891468e-01 | -5.739745e-02 | -0.803942 | -0.803942 | |
| max | 6.891468e-01 | -5.739745e-02 | 0.564723 | 0.689147 | |
| order_is_delivered | count | 5.433800e+04 | 1.818400e+04 | 20637.000000 | 2929.000000 |
| mean | 1.000000e+00 | 1.000000e+00 | 1.000000 | -0.361532 | |
| std | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.506387 | |
| min | 1.000000e+00 | 1.000000e+00 | 1.000000 | -1.000000 | |
| 25% | 1.000000e+00 | 1.000000e+00 | 1.000000 | -1.000000 | |
| 50% | 1.000000e+00 | 1.000000e+00 | 1.000000 | 0.000000 | |
| 75% | 1.000000e+00 | 1.000000e+00 | 1.000000 | 0.000000 | |
| max | 1.000000e+00 | 1.000000e+00 | 1.000000 | 0.823529 |
# Box plot of each scaled feature per cluster: melt the wide frame into
# (labels, Feature, value) rows so seaborn can group boxes by feature.
fig, ax = plt.subplots(figsize = (16,6))
# Fixes: draw explicitly on the axes we just created (ax was previously
# unused) and pass var_name as the scalar pandas documents, not a list.
sns.boxplot(x="labels", y="value", hue="Feature",
            data=pd.melt(data, id_vars=['labels'], var_name='Feature'),
            ax=ax)
<AxesSubplot:xlabel='labels', ylabel='value'>
# --- Birch clustering on this feature set --------------------------------
model_name = "BIRCH_5"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
# Cluster count retained after inspecting the evaluation plots above.
n_clusters = 4
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Re-fit on the full scaled data; label the 5% sample for projection plots.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()  # displayed for reference only; nothing below uses it
# One DataFrame of scaled features per cluster label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Per-cluster descriptive statistics of every feature.
data = X_scaled.copy()
data['labels'] = X_labels
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| order_payment_total | count | 19154.000000 | 30998.000000 | 31804.000000 | 14132.000000 |
| mean | -0.542640 | -0.466296 | 0.886097 | -0.235879 | |
| std | 0.892557 | 0.706996 | 0.800138 | 0.695591 | |
| min | -5.884510 | -2.433369 | -0.631551 | -2.363585 | |
| 25% | -1.190434 | -0.961889 | 0.319102 | -0.687015 | |
| 50% | -0.618476 | -0.522882 | 0.699203 | -0.213928 | |
| 75% | 0.019034 | -0.111267 | 1.296369 | 0.252327 | |
| max | 3.798729 | 2.738275 | 6.011733 | 2.490463 | |
| order_avg_shipping_distance | count | 19154.000000 | 30998.000000 | 31804.000000 | 14132.000000 |
| mean | -1.564805 | 0.451791 | 0.334056 | 0.378102 | |
| std | 0.827074 | 0.474031 | 0.608480 | 0.620963 | |
| min | -9.099968 | -1.740387 | -3.345493 | -2.368543 | |
| 25% | -2.109834 | 0.093870 | 0.028314 | 0.024786 | |
| 50% | -1.522912 | 0.394599 | 0.324595 | 0.348837 | |
| 75% | -0.935212 | 0.726483 | 0.699805 | 0.750697 | |
| max | 0.954075 | 2.363217 | 1.794412 | 2.014664 | |
| recency | count | 19154.000000 | 30998.000000 | 31804.000000 | 14132.000000 |
| mean | 0.148840 | 0.402217 | 0.162807 | -1.450375 | |
| std | 0.971940 | 0.685405 | 0.860037 | 0.525311 | |
| min | -2.976835 | -1.869058 | -2.966621 | -3.159611 | |
| 25% | -0.494653 | -0.146337 | -0.377773 | -1.837980 | |
| 50% | 0.286611 | 0.419476 | 0.220994 | -1.467495 | |
| 75% | 0.925993 | 0.960908 | 0.835075 | -1.086927 | |
| max | 1.590980 | 1.878712 | 1.655003 | 0.810309 | |
| review_score | count | 19154.000000 | 30998.000000 | 31804.000000 | 14132.000000 |
| mean | 0.111650 | -0.286383 | 0.263903 | -0.117069 | |
| std | 0.933790 | 1.193978 | 0.705357 | 1.001937 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.057397 | -1.550486 | -0.057397 | -0.803942 | |
| 50% | 0.689147 | 0.689147 | 0.689147 | -0.057397 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| order_is_delivered | count | 19154.000000 | 30998.000000 | 31804.000000 | 14132.000000 |
| mean | 0.980569 | 0.940725 | 0.983089 | 0.912220 | |
| std | 0.167308 | 0.297137 | 0.161456 | 0.361285 | |
| min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | |
| 25% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Box plots of the scaled features within each cluster (melted to long format
# so seaborn can colour one box per feature inside each label group).
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
<AxesSubplot:xlabel='labels', ylabel='value'>
# Feature set "5b": four standard-scaled features plus the order_is_delivered
# flag kept as a passthrough column (no log / log1p features this time).
passthrough_features = ['order_is_delivered']
standardscale_features = ['recency', 'review_score', 'order_payment_total', 'order_avg_shipping_distance']
log1pscale_features = []
logscale_features = []
# NOTE(review): passthrough_features is not passed to create_preprocessor_X —
# confirm the preprocessor passes unlisted columns through unchanged.
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Column order presumably must match the transformer order (log, log1p,
# standard, passthrough) — verify against create_preprocessor_X.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
# Rebuild a DataFrame: fit_transform returns a bare array.
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% sample used by the slower manifold / model-evaluation steps below.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
# Compute 2D projections of feature set "5b" for later cluster visualisation.
projection_version = "Version5b"
all_my_projections[projection_version] = {}
# Linear projection, fitted on the full scaled dataset.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
all_my_projections[projection_version]["PCA"] = {"X_proj" : X_pca, "model": pca}
# MDS is fitted on the 5% sample only (it has no transform() for new points).
mds = MDS(n_components=2)
X_reduced_mds = mds.fit_transform(X_reduced)
all_my_projections[projection_version]["MDS"] = {"X_proj" : X_reduced_mds, "model": mds}
# Isomap is fitted on the sample, then used to project the full dataset.
iso = Isomap(n_components=2)
X_reduced_iso = iso.fit_transform(X_reduced)
X_iso = iso.transform(X_scaled)
all_my_projections[projection_version]["Isomap"] = {"X_proj" : X_iso, "model": iso}
# t-SNE on the full dataset, initialised from the PCA embedding.
tsne = TSNE(n_components=2, init='pca')
X_tsne0 = tsne.fit_transform(X_scaled)
all_my_projections[projection_version]["TSNE_0"] = {"X_proj" : X_tsne0, "model": tsne}
# Checkpoint / restore the whole notebook session to disk (slow steps above).
# NOTE(review): `dill` is not among the visible imports — confirm it is
# imported earlier in the notebook.
dill.dump_session('notebook_env.db')
dill.load_session('notebook_env.db')
# Evaluate KMeans on feature set "5b" and inspect the 5-cluster solution.
model_name = "KMeans_5b"
# NOTE(review): no model name passed — presumably test_model defaults to
# KMeans; confirm against its definition.
model_dict = test_model(X_reduced)
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 5
# Retrieve the already-fitted model matching the chosen cluster count.
km = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on the full scaled dataset, then label the sample with the same model.
X_labels = km.fit_predict(X_scaled)
X_labels_reduced = km.predict(X_reduced.astype(float))
projection_version = "Version5b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
# One DataFrame of scaled features per cluster label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Cluster centroids side by side with each cluster's share of the customers.
pd.DataFrame(km.cluster_centers_, columns=X_scaled.columns).merge(
pd.Series(km.labels_, name="fraction").value_counts(normalize=True).sort_index(),
right_index=True, left_index=True)
| recency | review_score | order_payment_total | order_avg_shipping_distance | order_is_delivered | fraction | |
|---|---|---|---|---|---|---|
| 0 | 0.763848 | 0.441564 | -0.145306 | -0.354301 | 0.993756 | 0.407522 |
| 1 | -0.023553 | 0.001813 | 0.066567 | 2.324600 | 0.965664 | 0.107121 |
| 2 | -0.018411 | -1.838200 | -0.055128 | -0.188872 | 0.817553 | 0.163881 |
| 3 | -1.017231 | 0.409528 | -0.138947 | -0.263598 | 0.987339 | 0.301078 |
| 4 | 0.003493 | -0.108833 | 5.055249 | 0.280953 | 0.922926 | 0.020398 |
# Attach the cluster labels to the scaled features and summarise each cluster
# (describe() per label, transposed for readability).
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|---|
| recency | count | 39158.000000 | 10293.000000 | 15747.000000 | 28930.000000 | 1960.000000 |
| mean | 0.764443 | -0.023609 | -0.018411 | -1.016407 | 0.001790 | |
| std | 0.476141 | 0.960298 | 0.867159 | 0.618614 | 1.017357 | |
| min | -0.166378 | -3.159611 | -3.158795 | -2.971724 | -2.964532 | |
| 25% | 0.361021 | -0.682222 | -0.417369 | -1.483568 | -0.765105 | |
| 50% | 0.782628 | 0.079038 | 0.103684 | -0.955636 | 0.134644 | |
| 75% | 1.179999 | 0.744167 | 0.514682 | -0.473506 | 0.854440 | |
| max | 1.872943 | 1.550253 | 1.878712 | -0.069297 | 1.609604 | |
| review_score | count | 39158.000000 | 10293.000000 | 15747.000000 | 28930.000000 | 1960.000000 |
| mean | 0.441514 | 0.002031 | -1.838200 | 0.409647 | -0.109516 | |
| std | 0.423946 | 0.937974 | 0.614764 | 0.453527 | 1.100799 | |
| min | -0.803942 | -2.297030 | -2.297030 | -1.550486 | -2.297030 | |
| 25% | -0.057397 | -0.057397 | -2.297030 | -0.057397 | -0.803942 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | -1.550486 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | -0.617306 | 0.689147 | 0.689147 | |
| order_payment_total | count | 39158.000000 | 10293.000000 | 15747.000000 | 28930.000000 | 1960.000000 |
| mean | -0.145236 | 0.066196 | -0.055128 | -0.139237 | 5.052056 | |
| std | 0.480485 | 0.637850 | 0.566872 | 0.497941 | 3.124323 | |
| min | -0.726021 | -0.627107 | -0.726021 | -0.680725 | 2.465769 | |
| 25% | -0.464724 | -0.363471 | -0.433642 | -0.460631 | 3.199955 | |
| 50% | -0.280998 | -0.115534 | -0.227987 | -0.290017 | 4.081456 | |
| 75% | 0.005376 | 0.243553 | 0.117920 | -0.001630 | 5.739031 | |
| max | 2.703360 | 3.748593 | 2.960069 | 2.804119 | 60.736923 | |
| order_avg_shipping_distance | count | 39158.000000 | 10293.000000 | 15747.000000 | 28930.000000 | 1960.000000 |
| mean | -0.354219 | 2.324387 | -0.188872 | -0.263800 | 0.281370 | |
| std | 0.516540 | 0.731816 | 0.614941 | 0.539147 | 1.132120 | |
| min | -1.015365 | 0.778944 | -1.015365 | -1.015365 | -1.012663 | |
| 25% | -0.830832 | 1.691737 | -0.604688 | -0.709203 | -0.490169 | |
| 50% | -0.424279 | 2.433517 | -0.299491 | -0.340076 | -0.098264 | |
| 75% | -0.049753 | 2.847580 | 0.158392 | 0.097937 | 0.789290 | |
| max | 1.414568 | 13.585715 | 2.089006 | 1.917140 | 4.416886 | |
| order_is_delivered | count | 39158.000000 | 10293.000000 | 15747.000000 | 28930.000000 | 1960.000000 |
| mean | 0.993752 | 0.965664 | 0.817553 | 0.987418 | 0.922024 | |
| std | 0.093111 | 0.211210 | 0.507386 | 0.134946 | 0.355163 | |
| min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | |
| 25% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Per-cluster box plots, clipped to [-5, 15] to keep extreme outliers from
# squashing the scale.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Transposed view: one group of boxes per feature, coloured by cluster label.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Evaluate a Gaussian mixture on feature set "5b"; inspect 3 components.
model_name = "GaussianMixture_5b"
model_dict = test_model(X_reduced, "GaussianMixture")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 3
# Retrieve the already-fitted model matching the chosen component count.
gm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Use 10 restarts for the final fit to stabilise the EM solution.
gm.set_params(n_init=10)
X_labels = gm.fit_predict(X_scaled)
X_labels_reduced = gm.predict(X_reduced.astype(float))
projection_version = "Version5b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
# One DataFrame of scaled features per component label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Component means side by side with each component's mixture weight.
pd.DataFrame(gm.means_, columns=X_scaled.columns).merge(pd.Series(gm.weights_, name="weights"),
left_index=True, right_index=True)
| recency | review_score | order_payment_total | order_avg_shipping_distance | order_is_delivered | weights | |
|---|---|---|---|---|---|---|
| 0 | -0.030476 | -0.081337 | 0.540411 | 0.959851 | 1.00000 | 0.256573 |
| 1 | 0.026107 | 0.103256 | -0.276487 | -0.355473 | 1.00000 | 0.706334 |
| 2 | -0.286337 | -1.403610 | 1.526897 | 0.129694 | -0.11888 | 0.037093 |
# Append the component labels as a final column and describe each group.
data = X_scaled.copy()
data.insert(len(data.columns), 'labels', X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | |
|---|---|---|---|---|
| recency | count | 22536.000000 | 69996.000000 | 3556.000000 |
| mean | -0.025517 | 0.022855 | -0.288161 | |
| std | 1.002943 | 0.991045 | 1.103214 | |
| min | -2.967019 | -2.971724 | -3.159611 | |
| 25% | -0.728889 | -0.682693 | -1.139073 | |
| 50% | 0.098322 | 0.144555 | -0.242277 | |
| 75% | 0.789380 | 0.822535 | 0.580128 | |
| max | 1.558402 | 1.558593 | 1.878712 | |
| review_score | count | 22536.000000 | 69996.000000 | 3556.000000 |
| mean | -0.086599 | 0.099419 | -1.408126 | |
| std | 1.054635 | 0.912454 | 1.166300 | |
| min | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.803942 | -0.057397 | -2.297030 | |
| 50% | 0.689147 | 0.689147 | -2.297030 | |
| 75% | 0.689147 | 0.689147 | -0.803942 | |
| max | 0.689147 | 0.689147 | 0.689147 | |
| order_payment_total | count | 22536.000000 | 69996.000000 | 3556.000000 |
| mean | 0.612174 | -0.274506 | 1.523721 | |
| std | 1.030314 | 0.250112 | 3.589772 | |
| min | -0.632910 | -0.682884 | -0.726021 | |
| 25% | -0.167184 | -0.477814 | -0.392957 | |
| 50% | 0.445913 | -0.328971 | -0.084609 | |
| 75% | 0.969035 | -0.108112 | 1.266666 | |
| max | 5.364865 | 0.462477 | 60.736923 | |
| order_avg_shipping_distance | count | 22536.000000 | 69996.000000 | 3556.000000 |
| mean | 1.063620 | -0.349102 | 0.131050 | |
| std | 1.350990 | 0.485458 | 1.109439 | |
| min | -1.015365 | -1.015365 | -1.012639 | |
| 25% | -0.208285 | -0.795404 | -0.552296 | |
| 50% | 1.068316 | -0.409586 | -0.247087 | |
| 75% | 2.282846 | -0.043245 | 0.435966 | |
| max | 13.585715 | 1.012539 | 5.008011 | |
| order_is_delivered | count | 22536.000000 | 69996.000000 | 3556.000000 |
| mean | 1.000000 | 1.000000 | -0.121464 | |
| std | 0.000000 | 0.000000 | 0.693184 | |
| min | 1.000000 | 1.000000 | -1.000000 | |
| 25% | 1.000000 | 1.000000 | -1.000000 | |
| 50% | 1.000000 | 1.000000 | 0.000000 | |
| 75% | 1.000000 | 1.000000 | 0.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 |
# Per-component box plots, clipped to [-5, 15] to limit outlier influence.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Transposed view: one group of boxes per feature, coloured by component.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Evaluate Birch on feature set "5b"; inspect the 4-cluster solution.
model_name = "BIRCH_5b"
model_dict = test_model(X_reduced, "Birch")
all_my_models[model_name] = model_dict
plot_model_evaluation(model_dict)
n_clusters = 4
# Retrieve the already-fitted model matching the chosen cluster count.
bm = all_my_models[model_name]["models"][all_my_models[model_name]["n_clusters"].index(n_clusters)]
# Refit on the full scaled dataset, then label the sample with the same model.
X_labels = bm.fit_predict(X_scaled)
X_labels_reduced = bm.predict(X_reduced.astype(float))
projection_version = "Version5b"
projections_dict = create_projections_dict(projection_version, X_labels, X_labels_reduced)
plot_model_2D_projections(projections_dict)
sns.color_palette()
# One DataFrame of scaled features per cluster label.
data_dict = {("Group"+ str(c)) : X_scaled[X_labels == c] for c in set(X_labels)}
# Tag each row with its Birch cluster label and summarise every cluster.
data = X_scaled.assign(labels=X_labels)
data.groupby('labels').describe().T
| labels | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|
| recency | count | 75.000000 | 9770.000000 | 904.000000 | 85339.000000 |
| mean | -0.104255 | -0.188613 | -0.067606 | 0.022401 | |
| std | 1.262113 | 1.003125 | 1.006693 | 0.997016 | |
| min | -2.189398 | -3.159611 | -2.959412 | -3.158795 | |
| 25% | -1.478548 | -0.931911 | -0.789569 | -0.681181 | |
| 50% | 0.261107 | -0.128086 | -0.097492 | 0.151181 | |
| 75% | 0.975518 | 0.599064 | 0.792148 | 0.823463 | |
| max | 1.481748 | 1.550253 | 1.609604 | 1.878712 | |
| review_score | count | 75.000000 | 9770.000000 | 904.000000 | 85339.000000 |
| mean | -0.226614 | -0.182721 | -0.041707 | 0.021560 | |
| std | 1.155064 | 1.077122 | 1.063084 | 0.987771 | |
| min | -2.297030 | -2.297030 | -2.297030 | -2.297030 | |
| 25% | -0.803942 | -0.803942 | -0.057397 | -0.057397 | |
| 50% | 0.689147 | -0.057397 | 0.689147 | 0.689147 | |
| 75% | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| max | 0.689147 | 0.689147 | 0.689147 | 0.689147 | |
| order_payment_total | count | 75.000000 | 9770.000000 | 904.000000 | 85339.000000 |
| mean | 15.673564 | 0.181911 | 5.843565 | -0.096502 | |
| std | 7.087605 | 0.967242 | 1.915436 | 0.588014 | |
| min | 10.575690 | -0.627107 | 2.472471 | -0.726021 | |
| 25% | 11.993414 | -0.361132 | 4.367695 | -0.455683 | |
| 50% | 13.337413 | -0.108112 | 5.366462 | -0.267931 | |
| 75% | 16.410520 | 0.277828 | 7.190053 | 0.030926 | |
| max | 60.736923 | 7.469005 | 11.688305 | 4.806691 | |
| order_avg_shipping_distance | count | 75.000000 | 9770.000000 | 904.000000 | 85339.000000 |
| mean | 0.343279 | 2.377425 | 0.119458 | -0.273746 | |
| std | 1.042422 | 0.740307 | 0.927075 | 0.575249 | |
| min | -0.969350 | 0.252310 | -1.010641 | -1.015365 | |
| 25% | -0.369565 | 1.859913 | -0.520712 | -0.764473 | |
| 50% | 0.020710 | 2.503835 | -0.125015 | -0.376190 | |
| 75% | 0.882870 | 2.885412 | 0.629599 | 0.065974 | |
| max | 3.728863 | 13.585715 | 4.162164 | 2.254462 | |
| order_is_delivered | count | 75.000000 | 9770.000000 | 904.000000 | 85339.000000 |
| mean | 0.888889 | 0.957276 | 0.898783 | 0.959331 | |
| std | 0.432651 | 0.238444 | 0.410262 | 0.248977 | |
| min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | |
| 25% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Per-cluster box plots, clipped to [-5, 15] to limit outlier influence.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="labels", y="value", hue="Feature", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Transposed view: one group of boxes per feature, coloured by cluster label.
fig, ax = plt.subplots(figsize = (16,6))
sns.boxplot(x="Feature", y="value", hue="labels", data=pd.melt(data, id_vars=['labels'], var_name=['Feature']))
plt.ylim(-5, 15)
(-5.0, 15.0)
# Build a comparison table of every KMeans run (all feature versions) across
# 3..10 clusters, then rank the runs on each internal-validation metric.
df_summary = pd.DataFrame()
model_name_list = []
n_clusters_list = []
silouette_list = []
d_b_list = []
c_h_list = []
for model_key, results in all_my_models.items():
    # Only KMeans variants are compared in this table.
    if not model_key.startswith('KMeans'):
        continue
    for n_cl in range(3, 11):
        model_name_list.append(model_key)
        n_clusters_list.append(n_cl)
        pos = results['n_clusters'].index(n_cl)
        silouette_list.append(results['silouette_score'][pos])
        c_h_list.append(results['c_h_score'][pos])
        d_b_list.append(results['d_b_score'][pos])
df_summary["Model"] = model_name_list
df_summary["N_Clusters"] = n_clusters_list
df_summary["Silouette"] = silouette_list
df_summary["C_H"] = c_h_list
df_summary["D_B"] = d_b_list
# Rank each metric: higher is better for Silhouette and Calinski-Harabasz,
# lower is better for Davies-Bouldin (hence ascending=True).
for metric, ascending in zip(["Silouette", "C_H", "D_B"], [False, False, True]):
    df_summary[metric + "_Rank"] = np.zeros(len(model_name_list))
    for position, row_index in enumerate(df_summary.sort_values(metric, ascending=ascending).index.values):
        df_summary.loc[row_index, metric + "_Rank"] = position + 1
# Show the 50 best runs by silhouette score.
df_summary.sort_values("Silouette", ascending=False).iloc[:50, :]
| Model | N_Clusters | Silouette | C_H | D_B | Silouette_Rank | C_H_Rank | D_B_Rank | |
|---|---|---|---|---|---|---|---|---|
| 18 | KMeans_1b | 5 | 0.418858 | 2792.896364 | 0.801244 | 1.0 | 3.0 | 1.0 |
| 17 | KMeans_1b | 4 | 0.410601 | 2968.437162 | 0.814622 | 2.0 | 1.0 | 2.0 |
| 16 | KMeans_1b | 3 | 0.388658 | 2112.338882 | 0.974255 | 3.0 | 11.0 | 12.0 |
| 23 | KMeans_1b | 10 | 0.367159 | 2637.637163 | 0.888825 | 4.0 | 7.0 | 3.0 |
| 22 | KMeans_1b | 9 | 0.363167 | 2729.699580 | 0.928952 | 5.0 | 6.0 | 7.0 |
| 65 | KMeans_5b | 4 | 0.359402 | 1548.218173 | 1.045749 | 6.0 | 35.0 | 20.0 |
| 21 | KMeans_1b | 8 | 0.356197 | 2806.091801 | 0.889118 | 7.0 | 2.0 | 4.0 |
| 64 | KMeans_5b | 3 | 0.345894 | 1350.215255 | 1.266354 | 8.0 | 42.0 | 44.0 |
| 67 | KMeans_5b | 6 | 0.341951 | 1790.173315 | 0.938492 | 9.0 | 27.0 | 8.0 |
| 19 | KMeans_1b | 6 | 0.341312 | 2756.206418 | 0.893452 | 10.0 | 5.0 | 5.0 |
| 66 | KMeans_5b | 5 | 0.337394 | 1937.506560 | 0.942786 | 11.0 | 17.0 | 10.0 |
| 20 | KMeans_1b | 7 | 0.336841 | 2757.283534 | 0.973523 | 12.0 | 4.0 | 11.0 |
| 59 | KMeans_2b | 6 | 0.336546 | 1773.212441 | 0.917079 | 13.0 | 28.0 | 6.0 |
| 58 | KMeans_2b | 5 | 0.330917 | 1898.655788 | 0.939025 | 14.0 | 21.0 | 9.0 |
| 8 | KMeans_1 | 3 | 0.312296 | 2088.457862 | 1.173971 | 15.0 | 12.0 | 36.0 |
| 0 | KMeans_0 | 3 | 0.308175 | 2053.533920 | 1.186426 | 16.0 | 14.0 | 37.0 |
| 9 | KMeans_1 | 4 | 0.302657 | 2237.460839 | 1.014678 | 17.0 | 8.0 | 14.0 |
| 1 | KMeans_0 | 4 | 0.298145 | 2190.710830 | 1.032201 | 18.0 | 9.0 | 16.0 |
| 57 | KMeans_2b | 4 | 0.297459 | 1527.993001 | 1.100410 | 19.0 | 37.0 | 27.0 |
| 10 | KMeans_1 | 5 | 0.292447 | 2126.159451 | 1.042950 | 20.0 | 10.0 | 19.0 |
| 2 | KMeans_0 | 5 | 0.287581 | 2073.836363 | 1.056323 | 21.0 | 13.0 | 22.0 |
| 60 | KMeans_2b | 7 | 0.287305 | 1721.964169 | 1.024410 | 22.0 | 29.0 | 15.0 |
| 15 | KMeans_1 | 10 | 0.287113 | 1875.795620 | 0.983995 | 23.0 | 23.0 | 13.0 |
| 56 | KMeans_2b | 3 | 0.284405 | 1304.408935 | 1.318924 | 24.0 | 46.0 | 48.0 |
| 7 | KMeans_0 | 10 | 0.281388 | 1791.985215 | 1.032601 | 25.0 | 26.0 | 17.0 |
| 13 | KMeans_1 | 8 | 0.281287 | 1907.063560 | 1.090399 | 26.0 | 19.0 | 25.0 |
| 11 | KMeans_1 | 6 | 0.280849 | 2013.379981 | 1.147322 | 27.0 | 15.0 | 31.0 |
| 14 | KMeans_1 | 9 | 0.280093 | 1900.720513 | 1.033756 | 28.0 | 20.0 | 18.0 |
| 3 | KMeans_0 | 6 | 0.277204 | 1956.969284 | 1.148654 | 29.0 | 16.0 | 32.0 |
| 12 | KMeans_1 | 7 | 0.275617 | 1936.383148 | 1.172514 | 30.0 | 18.0 | 35.0 |
| 6 | KMeans_0 | 9 | 0.273965 | 1831.197241 | 1.054429 | 31.0 | 24.0 | 21.0 |
| 5 | KMeans_0 | 8 | 0.272462 | 1825.156604 | 1.097390 | 32.0 | 25.0 | 26.0 |
| 4 | KMeans_0 | 7 | 0.269796 | 1877.916501 | 1.192140 | 33.0 | 22.0 | 39.0 |
| 48 | KMeans_5 | 3 | 0.267401 | 1366.569862 | 1.395349 | 34.0 | 41.0 | 58.0 |
| 63 | KMeans_2b | 10 | 0.266347 | 1611.396067 | 1.077929 | 35.0 | 34.0 | 24.0 |
| 61 | KMeans_2b | 8 | 0.264096 | 1670.549704 | 1.077769 | 36.0 | 31.0 | 23.0 |
| 62 | KMeans_2b | 9 | 0.262125 | 1637.694835 | 1.110069 | 37.0 | 32.0 | 28.0 |
| 71 | KMeans_5b | 10 | 0.261566 | 1511.872803 | 1.165464 | 38.0 | 38.0 | 34.0 |
| 68 | KMeans_5b | 7 | 0.259188 | 1684.552295 | 1.122809 | 39.0 | 30.0 | 29.0 |
| 49 | KMeans_5 | 4 | 0.255961 | 1476.324037 | 1.258837 | 40.0 | 39.0 | 43.0 |
| 69 | KMeans_5b | 8 | 0.253687 | 1615.863646 | 1.150467 | 41.0 | 33.0 | 33.0 |
| 70 | KMeans_5b | 9 | 0.248400 | 1545.808597 | 1.137375 | 42.0 | 36.0 | 30.0 |
| 50 | KMeans_5 | 5 | 0.235623 | 1446.240287 | 1.195127 | 43.0 | 40.0 | 40.0 |
| 24 | KMeans_2 | 3 | 0.231404 | 1302.734801 | 1.522537 | 44.0 | 47.0 | 69.0 |
| 25 | KMeans_2 | 4 | 0.229442 | 1312.496426 | 1.318267 | 45.0 | 45.0 | 47.0 |
| 26 | KMeans_2 | 5 | 0.229186 | 1325.925305 | 1.188906 | 46.0 | 44.0 | 38.0 |
| 51 | KMeans_5 | 6 | 0.220333 | 1333.678145 | 1.236709 | 47.0 | 43.0 | 41.0 |
| 27 | KMeans_2 | 6 | 0.216024 | 1231.020676 | 1.238758 | 48.0 | 49.0 | 42.0 |
| 52 | KMeans_5 | 7 | 0.214607 | 1241.229202 | 1.376240 | 49.0 | 48.0 | 54.0 |
| 28 | KMeans_2 | 7 | 0.209742 | 1160.280746 | 1.277929 | 50.0 | 51.0 | 45.0 |
# Same comparison table, restricted to the non-"b" feature versions and to
# 4..6 clusters, covering every model family this time.
df_summary = pd.DataFrame()
model_name_list = []
n_clusters_list = []
silouette_list = []
d_b_list = []
c_h_list = []
for model_key, results in all_my_models.items():
    # Skip the "b" feature-set variants (keys ending with 'b').
    if model_key.endswith('b'):
        continue
    for n_cl in range(4, 7):
        model_name_list.append(model_key)
        n_clusters_list.append(n_cl)
        pos = results['n_clusters'].index(n_cl)
        silouette_list.append(results['silouette_score'][pos])
        c_h_list.append(results['c_h_score'][pos])
        d_b_list.append(results['d_b_score'][pos])
df_summary["Model"] = model_name_list
df_summary["N_Clusters"] = n_clusters_list
df_summary["Silouette"] = silouette_list
df_summary["C_H"] = c_h_list
df_summary["D_B"] = d_b_list
# Rank each metric: higher is better for Silhouette and Calinski-Harabasz,
# lower is better for Davies-Bouldin (hence ascending=True).
for metric, ascending in zip(["Silouette", "C_H", "D_B"], [False, False, True]):
    df_summary[metric + "_Rank"] = np.zeros(len(model_name_list))
    for position, row_index in enumerate(df_summary.sort_values(metric, ascending=ascending).index.values):
        df_summary.loc[row_index, metric + "_Rank"] = position + 1
# Show the 50 best runs by silhouette score.
df_summary.sort_values("Silouette", ascending=False).iloc[:50, :]
| Model | N_Clusters | Silouette | C_H | D_B | Silouette_Rank | C_H_Rank | D_B_Rank | |
|---|---|---|---|---|---|---|---|---|
| 9 | KMeans_1 | 4 | 0.302657 | 2237.460839 | 1.014678 | 1.0 | 1.0 | 1.0 |
| 0 | KMeans_0 | 4 | 0.298145 | 2190.710830 | 1.032201 | 2.0 | 2.0 | 2.0 |
| 10 | KMeans_1 | 5 | 0.292447 | 2126.159451 | 1.042950 | 3.0 | 3.0 | 3.0 |
| 1 | KMeans_0 | 5 | 0.287581 | 2073.836363 | 1.056323 | 4.0 | 4.0 | 4.0 |
| 11 | KMeans_1 | 6 | 0.280849 | 2013.379981 | 1.147322 | 5.0 | 5.0 | 7.0 |
| 2 | KMeans_0 | 6 | 0.277204 | 1956.969284 | 1.148654 | 6.0 | 6.0 | 8.0 |
| 16 | BIRCH_1 | 5 | 0.256525 | 1473.528706 | 1.146042 | 7.0 | 8.0 | 6.0 |
| 45 | KMeans_5 | 4 | 0.255961 | 1476.324037 | 1.258837 | 8.0 | 7.0 | 13.0 |
| 17 | BIRCH_1 | 6 | 0.245541 | 1267.700712 | 1.093469 | 9.0 | 13.0 | 5.0 |
| 46 | KMeans_5 | 5 | 0.235623 | 1446.240287 | 1.195127 | 10.0 | 9.0 | 10.0 |
| 18 | KMeans_2 | 4 | 0.229442 | 1312.496426 | 1.318267 | 11.0 | 12.0 | 16.0 |
| 19 | KMeans_2 | 5 | 0.229186 | 1325.925305 | 1.188906 | 12.0 | 11.0 | 9.0 |
| 15 | BIRCH_1 | 4 | 0.227406 | 1011.502651 | 1.272033 | 13.0 | 19.0 | 14.0 |
| 47 | KMeans_5 | 6 | 0.220333 | 1333.678145 | 1.236709 | 14.0 | 10.0 | 11.0 |
| 51 | BIRCH_5 | 4 | 0.218221 | 847.756236 | 1.525069 | 15.0 | 26.0 | 26.0 |
| 20 | KMeans_2 | 6 | 0.216024 | 1231.020676 | 1.238758 | 16.0 | 14.0 | 12.0 |
| 36 | KMeans_4 | 4 | 0.206618 | 1116.872652 | 1.507344 | 17.0 | 16.0 | 24.0 |
| 37 | KMeans_4 | 5 | 0.199048 | 1088.940431 | 1.388762 | 18.0 | 17.0 | 20.0 |
| 38 | KMeans_4 | 6 | 0.192879 | 1037.995656 | 1.319240 | 19.0 | 18.0 | 17.0 |
| 6 | BIRCH_0 | 4 | 0.183495 | 1153.152529 | 1.469112 | 20.0 | 15.0 | 22.0 |
| 29 | KMeans_3 | 6 | 0.174514 | 875.169065 | 1.359543 | 21.0 | 25.0 | 18.0 |
| 27 | KMeans_3 | 4 | 0.174485 | 906.553422 | 1.625089 | 22.0 | 22.0 | 27.0 |
| 28 | KMeans_3 | 5 | 0.171449 | 890.484183 | 1.486904 | 23.0 | 24.0 | 23.0 |
| 7 | BIRCH_0 | 5 | 0.171126 | 972.944927 | 1.380148 | 24.0 | 20.0 | 19.0 |
| 3 | GaussianMixture_0 | 4 | 0.166575 | 724.882592 | 2.207772 | 25.0 | 34.0 | 38.0 |
| 8 | BIRCH_0 | 6 | 0.160106 | 900.991574 | 1.286046 | 26.0 | 23.0 | 15.0 |
| 25 | BIRCH_2 | 5 | 0.150888 | 931.010243 | 1.446006 | 27.0 | 21.0 | 21.0 |
| 53 | BIRCH_5 | 6 | 0.144772 | 747.700516 | 1.697960 | 28.0 | 33.0 | 29.0 |
| 43 | BIRCH_4 | 5 | 0.144743 | 790.607165 | 1.652121 | 29.0 | 28.0 | 28.0 |
| 42 | BIRCH_4 | 4 | 0.143981 | 773.585221 | 1.786905 | 30.0 | 32.0 | 33.0 |
| 26 | BIRCH_2 | 6 | 0.142779 | 816.455222 | 1.511324 | 31.0 | 27.0 | 25.0 |
| 52 | BIRCH_5 | 5 | 0.141235 | 790.499112 | 1.841085 | 32.0 | 29.0 | 34.0 |
| 48 | GaussianMixture_5 | 4 | 0.129758 | 519.826340 | 1.950710 | 33.0 | 43.0 | 35.0 |
| 24 | BIRCH_2 | 4 | 0.116163 | 775.001256 | 1.699846 | 34.0 | 30.0 | 30.0 |
| 50 | GaussianMixture_5 | 6 | 0.113147 | 499.843052 | 3.135536 | 35.0 | 45.0 | 46.0 |
| 44 | BIRCH_4 | 6 | 0.112937 | 714.859417 | 1.713638 | 36.0 | 35.0 | 31.0 |
| 12 | GaussianMixture_1 | 4 | 0.109077 | 774.341078 | 3.003030 | 37.0 | 31.0 | 45.0 |
| 5 | GaussianMixture_0 | 6 | 0.097793 | 484.899529 | 3.245366 | 38.0 | 46.0 | 49.0 |
| 35 | BIRCH_3 | 6 | 0.097584 | 590.950027 | 1.732901 | 39.0 | 40.0 | 32.0 |
| 4 | GaussianMixture_0 | 5 | 0.097399 | 523.670482 | 2.651801 | 40.0 | 42.0 | 43.0 |
| 13 | GaussianMixture_1 | 5 | 0.094252 | 660.343709 | 2.433431 | 41.0 | 36.0 | 40.0 |
| 34 | BIRCH_3 | 5 | 0.092288 | 595.297894 | 2.002667 | 42.0 | 39.0 | 36.0 |
| 33 | BIRCH_3 | 4 | 0.088822 | 596.032294 | 2.224831 | 43.0 | 38.0 | 39.0 |
| 49 | GaussianMixture_5 | 5 | 0.081307 | 429.555716 | 2.984285 | 44.0 | 48.0 | 44.0 |
| 21 | GaussianMixture_2 | 4 | 0.074657 | 533.125491 | 3.626389 | 45.0 | 41.0 | 53.0 |
| 30 | GaussianMixture_3 | 4 | 0.062698 | 466.430638 | 2.557749 | 46.0 | 47.0 | 42.0 |
| 22 | GaussianMixture_2 | 5 | 0.054073 | 428.930210 | 3.145821 | 47.0 | 49.0 | 47.0 |
| 39 | GaussianMixture_4 | 4 | 0.052981 | 509.992953 | 2.497536 | 48.0 | 44.0 | 41.0 |
| 40 | GaussianMixture_4 | 5 | 0.029443 | 334.914357 | 3.558019 | 49.0 | 52.0 | 52.0 |
| 14 | GaussianMixture_1 | 6 | 0.021204 | 599.533967 | 2.078100 | 50.0 | 37.0 | 37.0 |
# New feature set: three features, with order_payment_total log1p-scaled to
# tame its long right tail; no passthrough columns this time.
passthrough_features = []
standardscale_features = ['recency', 'review_score']
log1pscale_features = ['order_payment_total']
logscale_features = []
preprocessor_X = create_preprocessor_X(logscale_features = logscale_features,
standardscale_features = standardscale_features,
log1pscale_features = log1pscale_features)
# Column order presumably must match the transformer order (log, log1p,
# standard, passthrough) — verify against create_preprocessor_X.
X = customers_df[logscale_features + log1pscale_features + standardscale_features + passthrough_features].dropna().copy()
X_scaled = preprocessor_X.fit_transform(X)
# Rebuild a DataFrame: fit_transform returns a bare array.
X_scaled = pd.DataFrame(X_scaled, index = X.index, columns = X.columns).astype(float)
# 5% sample used by the slower manifold / model-evaluation steps below.
X_reduced = X_scaled.sample(frac=0.05, random_state=seed).astype(np.float32)
all_components = len(X_scaled.columns)
from sklearn.neighbors import kneighbors_graph
# Connectivity constraint: each sample may only be merged with one of its
# 30 nearest neighbours, which keeps Ward linkage tractable on ~96k rows.
knn_graph = kneighbors_graph(X_scaled, 30, include_self=False)
# compute_distances=True stores merge distances, required to plot a dendrogram.
agg = AgglomerativeClustering(n_clusters=4, connectivity=knn_graph, linkage='ward', compute_distances=True)
X_labels = agg.fit_predict(X_scaled)
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram for a fitted agglomerative clustering model.

    Builds a SciPy-style linkage matrix from the model's ``children_`` and
    ``distances_`` attributes (the model must have been fitted with
    ``compute_distances=True``) and draws it with
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_`` (shape (n-1, 2)), ``labels_``
        and ``distances_`` — e.g. sklearn's AgglomerativeClustering.
    **kwargs
        Forwarded to ``scipy.cluster.hierarchy.dendrogram``
        (e.g. ``truncate_mode``, ``p``, ``no_plot``).

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram`` (leaf order, coordinates,
        ...). Previously the result was discarded; returning it is
        backward-compatible and lets callers inspect the layout.
    """
    # Count the original samples under each internal node. Children with an
    # index below n_samples are leaves (count 1); larger indices refer to
    # earlier merges, whose counts are already filled in (bottom-up order).
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # SciPy linkage row format: [child_a, child_b, distance, n_observations].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Draw (or, with no_plot=True, only compute) the dendrogram.
    return dendrogram(linkage_matrix, **kwargs)
plt.figure(figsize=(15,5))
plt.title("Hierarchical Clustering Dendrogram")
# Show only the top 3 merge levels (truncate_mode='level', p=3).
plot_dendrogram(agg, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
# Inspect every fitted attribute of the agglomerative model.
agg.__dict__
{'n_clusters': 4,
'distance_threshold': None,
'memory': None,
'connectivity': <96088x96088 sparse matrix of type '<class 'numpy.float64'>'
with 2882640 stored elements in Compressed Sparse Row format>,
'compute_full_tree': 'auto',
'linkage': 'ward',
'affinity': 'euclidean',
'compute_distances': True,
'n_features_in_': 4,
'children_': array([[ 35768, 62330],
[ 79821, 96024],
[ 40770, 45097],
...,
[192169, 192170],
[192171, 192172],
[192167, 192173]], dtype=int64),
'n_connected_components_': 1,
'n_leaves_': 96088,
'distances_': array([0.00000000e+00, 6.79083262e-07, 7.54536958e-07, ...,
2.62896278e+02, 3.20175475e+02, 3.66977986e+02]),
'n_clusters_': 4,
'labels_': array([0, 1, 1, ..., 3, 1, 2], dtype=int64)}